Total coverage: 291937 (19%)of 1584726
62 4 60 14 22 47 186 1 1 1 59 39 8 37 39 2 51 4 51 45 502 478 478 477 274 332 333 92 299 110 9 286 416 270 384 97 149 323 57 289 110 144 344 57 57 57 66 30 65 66 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/fat/misc.c * * Written 1992,1993 by Werner Almesberger * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) */ #include "fat.h" #include <linux/iversion.h> /* * fat_fs_error reports a file system problem that might indicate fa data * corruption/inconsistency. Depending on 'errors' mount option the * panic() is called, or error message is printed FAT and nothing is done, * or filesystem is remounted read-only (default behavior). * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) { struct fat_mount_options *opts = &MSDOS_SB(sb)->options; va_list args; struct va_format vaf; if (report) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; fat_msg(sb, KERN_ERR, "error, %pV", &vaf); va_end(args); } if (opts->errors == FAT_ERRORS_PANIC) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) { sb->s_flags |= SB_RDONLY; fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } EXPORT_SYMBOL_GPL(__fat_fs_error); /** * fat_msg() - print preformated FAT specific messages. Every thing what is * not fat_fs_error() should be fat_msg(). */ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ int fat_clusters_flush(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct fat_boot_fsinfo *fsinfo; if (!is_fat32(sbi)) return 0; bh = sb_bread(sb, sbi->fsinfo_sector); if (bh == NULL) { fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush"); return -EIO; } fsinfo = (struct fat_boot_fsinfo *)bh->b_data; /* Sanity check */ if (!IS_FSINFO(fsinfo)) { fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: " "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); } else { if (sbi->free_clusters != -1) fsinfo->free_clusters = cpu_to_le32(sbi->free_clusters); if (sbi->prev_free != -1) fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); mark_buffer_dirty(bh); } brelse(bh); return 0; } /* * fat_chain_add() adds a new cluster to the chain of clusters represented * by inode. */ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); int ret, new_fclus, last; /* * We must locate the last cluster of the file to add this new * one (new_dclus) to the end of the link list (the FAT). */ last = new_fclus = 0; if (MSDOS_I(inode)->i_start) { int fclus, dclus; ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); if (ret < 0) return ret; new_fclus = fclus + 1; last = dclus; } /* add new one to the last of the cluster chain */ if (last) { struct fat_entry fatent; fatent_init(&fatent); ret = fat_ent_read(inode, &fatent, last); if (ret >= 0) { int wait = inode_needs_sync(inode); ret = fat_ent_write(inode, &fatent, new_dclus, wait); fatent_brelse(&fatent); } if (ret < 0) return ret; /* * FIXME:Although we can add this cache, fat_cache_add() is * assuming to be called after linear search with fat_cache_id. */ // fat_cache_add(inode, new_fclus, new_dclus); } else { MSDOS_I(inode)->i_start = new_dclus; MSDOS_I(inode)->i_logstart = new_dclus; /* * Since generic_write_sync() synchronizes regular files later, * we sync here only directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { ret = fat_sync_inode(inode); if (ret) return ret; } else mark_inode_dirty(inode); } if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { fat_fs_error(sb, "clusters badly computed (%d != %llu)", new_fclus, (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); fat_cache_inval_inode(inode); } inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9); return 0; } /* * The epoch of FAT timestamp is 1980. * : bits : value * date: 0 - 4: day (1 - 31) * date: 5 - 8: month (1 - 12) * date: 9 - 15: year (0 - 127) from 1980 * time: 0 - 4: sec (0 - 29) 2sec counts * time: 5 - 10: min (0 - 59) * time: 11 - 15: hour (0 - 23) */ #define SECS_PER_MIN 60 #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) /* days between 1.1.70 and 1.1.80 (2 leap days) */ #define DAYS_DELTA (365 * 10 + 2) /* 120 (2100 - 1980) isn't leap year */ #define YEAR_2100 120 #define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100) /* Linear day numbers of the respective 1sts in non-leap years. */ static long days_in_year[] = { /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; static inline int fat_tz_offset(struct msdos_sb_info *sbi) { return (sbi->options.tz_set ? -sbi->options.time_offset : sys_tz.tz_minuteswest) * SECS_PER_MIN; } /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs) { u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date); time64_t second; long day, leap_day, month, year; year = date >> 9; month = max(1, (date >> 5) & 0xf); day = max(1, date & 0x1f) - 1; leap_day = (year + 3) / 4; if (year > YEAR_2100) /* 2100 isn't leap year */ leap_day--; if (IS_LEAP_YEAR(year) && month > 2) leap_day++; second = (time & 0x1f) << 1; second += ((time >> 5) & 0x3f) * SECS_PER_MIN; second += (time >> 11) * SECS_PER_HOUR; second += (time64_t)(year * 365 + leap_day + days_in_year[month] + day + DAYS_DELTA) * SECS_PER_DAY; second += fat_tz_offset(sbi); if (time_cs) { ts->tv_sec = second + (time_cs / 100); ts->tv_nsec = (time_cs % 100) * 10000000; } else { ts->tv_sec = second; ts->tv_nsec = 0; } } /* Export fat_time_fat2unix() for the fat_test KUnit tests. */ EXPORT_SYMBOL_GPL(fat_time_fat2unix); /* Convert linear UNIX date to a FAT time/date pair. */ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; time64_to_tm(ts->tv_sec, -fat_tz_offset(sbi), &tm); /* FAT can only support year between 1980 to 2107 */ if (tm.tm_year < 1980 - 1900) { *time = 0; *date = cpu_to_le16((0 << 9) | (1 << 5) | 1); if (time_cs) *time_cs = 0; return; } if (tm.tm_year > 2107 - 1900) { *time = cpu_to_le16((23 << 11) | (59 << 5) | 29); *date = cpu_to_le16((127 << 9) | (12 << 5) | 31); if (time_cs) *time_cs = 199; return; } /* from 1900 -> from 1980 */ tm.tm_year -= 80; /* 0~11 -> 1~12 */ tm.tm_mon++; /* 0~59 -> 0~29(2sec counts) */ tm.tm_sec >>= 1; *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec); *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday); if (time_cs) *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000; } EXPORT_SYMBOL_GPL(fat_time_unix2fat); static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) { return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) { if (ts.tv_nsec) ts.tv_nsec -= ts.tv_nsec % 10000000UL; return ts; } /* * truncate the various times with appropriate granularity: * root inode: * all times always 0 * all other inodes: * mtime - 2 seconds * ctime * msdos - 2 seconds * vfat - 10 milliseconds * atime - 24 hours (00:00:00 in local timezone) */ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); struct timespec64 ts; if (inode->i_ino == MSDOS_ROOT_INO) return 0; if (now == NULL) { now = &ts; ts = current_time(inode); } if (flags & S_ATIME) { /* to localtime */ time64_t seconds = now->tv_sec - fat_tz_offset(sbi); s32 remainder; div_s64_rem(seconds, SECS_PER_DAY, &remainder); /* to day boundary, and back to unix time */ seconds = seconds + fat_tz_offset(sbi) - remainder; inode->i_atime = (struct timespec64){ seconds, 0 }; } if (flags & S_CTIME) { if (sbi->options.isvfat) inode->i_ctime = fat_timespec64_trunc_10ms(*now); else inode->i_ctime = fat_timespec64_trunc_2secs(*now); } if (flags & S_MTIME) inode->i_mtime = fat_timespec64_trunc_2secs(*now); return 0; } EXPORT_SYMBOL_GPL(fat_truncate_time); int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) { int dirty_flags = 0; if (inode->i_ino == MSDOS_ROOT_INO) return 0; if (flags & (S_ATIME | S_CTIME | S_MTIME)) { fat_truncate_time(inode, now, flags); if (inode->i_sb->s_flags & SB_LAZYTIME) dirty_flags |= I_DIRTY_TIME; else dirty_flags |= I_DIRTY_SYNC; } if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); return 0; } EXPORT_SYMBOL_GPL(fat_update_time); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { int i, err = 0; for (i = 0; i < nr_bhs; i++) write_dirty_buffer(bhs[i], 0); for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); if (!err && !buffer_uptodate(bhs[i])) err = -EIO; } return err; }
13 13 3128 3128 11 11 2315 2311 2312 2313 2312 23 2290 2331 23 2310 2334 2344 32 151 438 2304 2307 13 1989 751 2302 2305 2306 1 2302 2309 753 2288 2283 2284 2007 2012 2013 3 3 2304 2301 2308 2304 2303 2306 2306 13 13 3 2303 2304 2305 2303 2303 2305 2302 2457 119 119 187 2310 2456 186 2304 2457 3 2454 2396 2397 1613 2236 2 2234 2232 265 37 2228 2234 2 2233 2394 2396 2398 2400 2397 2396 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/dd.c - The core device/driver interactions. * * This file contains the (sometimes tricky) code that controls the * interactions between devices and drivers, which primarily includes * driver binding and unbinding. * * All of this code used to exist in drivers/base/bus.c, but was * relocated to here in the name of compartmentalization (since it wasn't * strictly code just for the 'struct bus_type'. * * Copyright (c) 2002-5 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007-2009 Novell Inc. */ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/delay.h> #include <linux/dma-map-ops.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/async.h> #include <linux/pm_runtime.h> #include <linux/pinctrl/devinfo.h> #include <linux/slab.h> #include "base.h" #include "power/power.h" /* * Deferred Probe infrastructure. * * Sometimes driver probe order matters, but the kernel doesn't always have * dependency information which means some drivers will get probed before a * resource it depends on is available. For example, an SDHCI driver may * first need a GPIO line from an i2c GPIO controller before it can be * initialized. If a required resource is not available yet, a driver can * request probing to be deferred by returning -EPROBE_DEFER from its probe hook * * Deferred probe maintains two lists of devices, a pending list and an active * list. A driver returning -EPROBE_DEFER causes the device to be added to the * pending list. A successful driver probe will trigger moving all devices * from the pending to the active list so that the workqueue will eventually * retry them. * * The deferred_probe_mutex must be held any time the deferred_probe_*_list * of the (struct device*)->p->deferred_probe pointers are manipulated */ static DEFINE_MUTEX(deferred_probe_mutex); static LIST_HEAD(deferred_probe_pending_list); static LIST_HEAD(deferred_probe_active_list); static atomic_t deferred_trigger_count = ATOMIC_INIT(0); static bool initcalls_done; /* Save the async probe drivers' name from kernel cmdline */ #define ASYNC_DRV_NAMES_MAX_LEN 256 static char async_probe_drv_names[ASYNC_DRV_NAMES_MAX_LEN]; /* * In some cases, like suspend to RAM or hibernation, It might be reasonable * to prohibit probing of devices as it could be unsafe. * Once defer_all_probes is true all drivers probes will be forcibly deferred. */ static bool defer_all_probes; static void __device_set_deferred_probe_reason(const struct device *dev, char *reason) { kfree(dev->p->deferred_probe_reason); dev->p->deferred_probe_reason = reason; } /* * deferred_probe_work_func() - Retry probing devices in the active list. */ static void deferred_probe_work_func(struct work_struct *work) { struct device *dev; struct device_private *private; /* * This block processes every device in the deferred 'active' list. * Each device is removed from the active list and passed to * bus_probe_device() to re-attempt the probe. The loop continues * until every device in the active list is removed and retried. * * Note: Once the device is removed from the list and the mutex is * released, it is possible for the device get freed by another thread * and cause a illegal pointer dereference. This code uses * get/put_device() to ensure the device structure cannot disappear * from under our feet. */ mutex_lock(&deferred_probe_mutex); while (!list_empty(&deferred_probe_active_list)) { private = list_first_entry(&deferred_probe_active_list, typeof(*dev->p), deferred_probe); dev = private->device; list_del_init(&private->deferred_probe); get_device(dev); __device_set_deferred_probe_reason(dev, NULL); /* * Drop the mutex while probing each device; the probe path may * manipulate the deferred list */ mutex_unlock(&deferred_probe_mutex); /* * Force the device to the end of the dpm_list since * the PM code assumes that the order we add things to * the list is a good order for suspend but deferred * probe makes that very unsafe. */ device_pm_move_to_tail(dev); dev_dbg(dev, "Retrying from deferred list\n"); bus_probe_device(dev); mutex_lock(&deferred_probe_mutex); put_device(dev); } mutex_unlock(&deferred_probe_mutex); } static DECLARE_WORK(deferred_probe_work, deferred_probe_work_func); void driver_deferred_probe_add(struct device *dev) { if (!dev->can_match) return; mutex_lock(&deferred_probe_mutex); if (list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Added to deferred list\n"); list_add_tail(&dev->p->deferred_probe, &deferred_probe_pending_list); } mutex_unlock(&deferred_probe_mutex); } void driver_deferred_probe_del(struct device *dev) { mutex_lock(&deferred_probe_mutex); if (!list_empty(&dev->p->deferred_probe)) { dev_dbg(dev, "Removed from deferred list\n"); list_del_init(&dev->p->deferred_probe); __device_set_deferred_probe_reason(dev, NULL); } mutex_unlock(&deferred_probe_mutex); } static bool driver_deferred_probe_enable = false; /** * driver_deferred_probe_trigger() - Kick off re-probing deferred devices * * This functions moves all devices from the pending list to the active * list and schedules the deferred probe workqueue to process them. It * should be called anytime a driver is successfully bound to a device. * * Note, there is a race condition in multi-threaded probe. In the case where * more than one device is probing at the same time, it is possible for one * probe to complete successfully while another is about to defer. If the second * depends on the first, then it will get put on the pending list after the * trigger event has already occurred and will be stuck there. * * The atomic 'deferred_trigger_count' is used to determine if a successful * trigger has occurred in the midst of probing a driver. If the trigger count * changes in the midst of a probe, then deferred processing should be triggered * again. */ static void driver_deferred_probe_trigger(void) { if (!driver_deferred_probe_enable) return; /* * A successful probe means that all the devices in the pending list * should be triggered to be reprobed. Move all the deferred devices * into the active list so they can be retried by the workqueue */ mutex_lock(&deferred_probe_mutex); atomic_inc(&deferred_trigger_count); list_splice_tail_init(&deferred_probe_pending_list, &deferred_probe_active_list); mutex_unlock(&deferred_probe_mutex); /* * Kick the re-probe thread. It may already be scheduled, but it is * safe to kick it again. */ queue_work(system_unbound_wq, &deferred_probe_work); } /** * device_block_probing() - Block/defer device's probes * * It will disable probing of devices and defer their probes instead. */ void device_block_probing(void) { defer_all_probes = true; /* sync with probes to avoid races. */ wait_for_device_probe(); } /** * device_unblock_probing() - Unblock/enable device's probes * * It will restore normal behavior and trigger re-probing of deferred * devices. */ void device_unblock_probing(void) { defer_all_probes = false; driver_deferred_probe_trigger(); } /** * device_set_deferred_probe_reason() - Set defer probe reason message for device * @dev: the pointer to the struct device * @vaf: the pointer to va_format structure with message */ void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf) { const char *drv = dev_driver_string(dev); char *reason; mutex_lock(&deferred_probe_mutex); reason = kasprintf(GFP_KERNEL, "%s: %pV", drv, vaf); __device_set_deferred_probe_reason(dev, reason); mutex_unlock(&deferred_probe_mutex); } /* * deferred_devs_show() - Show the devices in the deferred probe pending list. */ static int deferred_devs_show(struct seq_file *s, void *data) { struct device_private *curr; mutex_lock(&deferred_probe_mutex); list_for_each_entry(curr, &deferred_probe_pending_list, deferred_probe) seq_printf(s, "%s\t%s", dev_name(curr->device), curr->device->p->deferred_probe_reason ?: "\n"); mutex_unlock(&deferred_probe_mutex); return 0; } DEFINE_SHOW_ATTRIBUTE(deferred_devs); int driver_deferred_probe_timeout; EXPORT_SYMBOL_GPL(driver_deferred_probe_timeout); static int __init deferred_probe_timeout_setup(char *str) { int timeout; if (!kstrtoint(str, 10, &timeout)) driver_deferred_probe_timeout = timeout; return 1; } __setup("deferred_probe_timeout=", deferred_probe_timeout_setup); /** * driver_deferred_probe_check_state() - Check deferred probe state * @dev: device to check * * Return: * -ENODEV if initcalls have completed and modules are disabled. * -ETIMEDOUT if the deferred probe timeout was set and has expired * and modules are enabled. * -EPROBE_DEFER in other cases. * * Drivers or subsystems can opt-in to calling this function instead of directly * returning -EPROBE_DEFER. */ int driver_deferred_probe_check_state(struct device *dev) { if (!IS_ENABLED(CONFIG_MODULES) && initcalls_done) { dev_warn(dev, "ignoring dependency for device, assuming no driver\n"); return -ENODEV; } if (!driver_deferred_probe_timeout && initcalls_done) { dev_warn(dev, "deferred probe timeout, ignoring dependency\n"); return -ETIMEDOUT; } return -EPROBE_DEFER; } EXPORT_SYMBOL_GPL(driver_deferred_probe_check_state); static void deferred_probe_timeout_work_func(struct work_struct *work) { struct device_private *p; fw_devlink_drivers_done(); driver_deferred_probe_timeout = 0; driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); mutex_lock(&deferred_probe_mutex); list_for_each_entry(p, &deferred_probe_pending_list, deferred_probe) dev_info(p->device, "deferred probe pending\n"); mutex_unlock(&deferred_probe_mutex); } static DECLARE_DELAYED_WORK(deferred_probe_timeout_work, deferred_probe_timeout_work_func); /** * deferred_probe_initcall() - Enable probing of deferred devices * * We don't want to get in the way when the bulk of drivers are getting probed. * Instead, this initcall makes sure that deferred probing is delayed until * late_initcall time. */ static int deferred_probe_initcall(void) { debugfs_create_file("devices_deferred", 0444, NULL, NULL, &deferred_devs_fops); driver_deferred_probe_enable = true; driver_deferred_probe_trigger(); /* Sort as many dependencies as possible before exiting initcalls */ flush_work(&deferred_probe_work); initcalls_done = true; if (!IS_ENABLED(CONFIG_MODULES)) fw_devlink_drivers_done(); /* * Trigger deferred probe again, this time we won't defer anything * that is optional */ driver_deferred_probe_trigger(); flush_work(&deferred_probe_work); if (driver_deferred_probe_timeout > 0) { schedule_delayed_work(&deferred_probe_timeout_work, driver_deferred_probe_timeout * HZ); } return 0; } late_initcall(deferred_probe_initcall); static void __exit deferred_probe_exit(void) { debugfs_lookup_and_remove("devices_deferred", NULL); } __exitcall(deferred_probe_exit); /** * device_is_bound() - Check if device is bound to a driver * @dev: device to check * * Returns true if passed device has already finished probing successfully * against a driver. * * This function must be called with the device lock held. */ bool device_is_bound(struct device *dev) { return dev->p && klist_node_attached(&dev->p->knode_driver); } static void driver_bound(struct device *dev) { if (device_is_bound(dev)) { pr_warn("%s: device %s already bound\n", __func__, kobject_name(&dev->kobj)); return; } pr_debug("driver: '%s': %s: bound to device '%s'\n", dev->driver->name, __func__, dev_name(dev)); klist_add_tail(&dev->p->knode_driver, &dev->driver->p->klist_devices); device_links_driver_bound(dev); device_pm_check_callbacks(dev); /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices */ driver_deferred_probe_del(dev); driver_deferred_probe_trigger(); if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_BOUND_DRIVER, dev); kobject_uevent(&dev->kobj, KOBJ_BIND); } static ssize_t coredump_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { device_lock(dev); dev->driver->coredump(dev); device_unlock(dev); return count; } static DEVICE_ATTR_WO(coredump); static int driver_sysfs_add(struct device *dev) { int ret; if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_BIND_DRIVER, dev); ret = sysfs_create_link(&dev->driver->p->kobj, &dev->kobj, kobject_name(&dev->kobj)); if (ret) goto fail; ret = sysfs_create_link(&dev->kobj, &dev->driver->p->kobj, "driver"); if (ret) goto rm_dev; if (!IS_ENABLED(CONFIG_DEV_COREDUMP) || !dev->driver->coredump) return 0; ret = device_create_file(dev, &dev_attr_coredump); if (!ret) return 0; sysfs_remove_link(&dev->kobj, "driver"); rm_dev: sysfs_remove_link(&dev->driver->p->kobj, kobject_name(&dev->kobj)); fail: return ret; } static void driver_sysfs_remove(struct device *dev) { struct device_driver *drv = dev->driver; if (drv) { if (drv->coredump) device_remove_file(dev, &dev_attr_coredump); sysfs_remove_link(&drv->p->kobj, kobject_name(&dev->kobj)); sysfs_remove_link(&dev->kobj, "driver"); } } /** * device_bind_driver - bind a driver to one device. * @dev: device. * * Allow manual attachment of a driver to a device. * Caller must have already set @dev->driver. * * Note that this does not modify the bus reference count. * Please verify that is accounted for before calling this. * (It is ok to call with no other effort from a driver's probe() method.) * * This function must be called with the device lock held. * * Callers should prefer to use device_driver_attach() instead. */ int device_bind_driver(struct device *dev) { int ret; ret = driver_sysfs_add(dev); if (!ret) { device_links_force_bind(dev); driver_bound(dev); } else if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_DRIVER_NOT_BOUND, dev); return ret; } EXPORT_SYMBOL_GPL(device_bind_driver); static atomic_t probe_count = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(probe_waitqueue); static ssize_t state_synced_show(struct device *dev, struct device_attribute *attr, char *buf) { bool val; device_lock(dev); val = dev->state_synced; device_unlock(dev); return sysfs_emit(buf, "%u\n", val); } static DEVICE_ATTR_RO(state_synced); static int call_driver_probe(struct device *dev, struct device_driver *drv) { int ret = 0; if (dev->bus->probe) ret = dev->bus->probe(dev); else if (drv->probe) ret = drv->probe(dev); switch (ret) { case 0: break; case -EPROBE_DEFER: /* Driver requested deferred probing */ dev_dbg(dev, "Driver %s requests probe deferral\n", drv->name); break; case -ENODEV: case -ENXIO: pr_debug("%s: probe of %s rejects match %d\n", drv->name, dev_name(dev), ret); break; default: /* driver matched but the probe failed */ pr_warn("%s: probe of %s failed with error %d\n", drv->name, dev_name(dev), ret); break; } return ret; } static int really_probe(struct device *dev, struct device_driver *drv) { bool test_remove = IS_ENABLED(CONFIG_DEBUG_TEST_DRIVER_REMOVE) && !drv->suppress_bind_attrs; int ret; if (defer_all_probes) { /* * Value of defer_all_probes can be set only by * device_block_probing() which, in turn, will call * wait_for_device_probe() right after that to avoid any races. */ dev_dbg(dev, "Driver %s force probe deferral\n", drv->name); return -EPROBE_DEFER; } ret = device_links_check_suppliers(dev); if (ret) return ret; pr_debug("bus: '%s': %s: probing driver %s with device %s\n", drv->bus->name, __func__, drv->name, dev_name(dev)); if (!list_empty(&dev->devres_head)) { dev_crit(dev, "Resources present before probing\n"); ret = -EBUSY; goto done; } re_probe: dev->driver = drv; /* If using pinctrl, bind pins now before probing */ ret = pinctrl_bind_pins(dev); if (ret) goto pinctrl_bind_failed; if (dev->bus->dma_configure) { ret = dev->bus->dma_configure(dev); if (ret) goto probe_failed; } ret = driver_sysfs_add(dev); if (ret) { pr_err("%s: driver_sysfs_add(%s) failed\n", __func__, dev_name(dev)); goto probe_failed; } if (dev->pm_domain && dev->pm_domain->activate) { ret = dev->pm_domain->activate(dev); if (ret) goto probe_failed; } ret = call_driver_probe(dev, drv); if (ret) { /* * Return probe errors as positive values so that the callers * can distinguish them from other errors. */ ret = -ret; goto probe_failed; } ret = device_add_groups(dev, drv->dev_groups); if (ret) { dev_err(dev, "device_add_groups() failed\n"); goto dev_groups_failed; } if (dev_has_sync_state(dev)) { ret = device_create_file(dev, &dev_attr_state_synced); if (ret) { dev_err(dev, "state_synced sysfs add failed\n"); goto dev_sysfs_state_synced_failed; } } if (test_remove) { test_remove = false; device_remove_file(dev, &dev_attr_state_synced); device_remove_groups(dev, drv->dev_groups); if (dev->bus->remove) dev->bus->remove(dev); else if (drv->remove) drv->remove(dev); devres_release_all(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; driver_sysfs_remove(dev); dev->driver = NULL; dev_set_drvdata(dev, NULL); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); goto re_probe; } pinctrl_init_done(dev); if (dev->pm_domain && dev->pm_domain->sync) dev->pm_domain->sync(dev); driver_bound(dev); pr_debug("bus: '%s': %s: bound device %s to driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); goto done; dev_sysfs_state_synced_failed: device_remove_groups(dev, drv->dev_groups); dev_groups_failed: if (dev->bus->remove) dev->bus->remove(dev); else if (drv->remove) drv->remove(dev); probe_failed: if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_DRIVER_NOT_BOUND, dev); pinctrl_bind_failed: device_links_no_driver(dev); devres_release_all(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; driver_sysfs_remove(dev); dev->driver = NULL; dev_set_drvdata(dev, NULL); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); done: return ret; } /* * For initcall_debug, show the driver probe time. */ static int really_probe_debug(struct device *dev, struct device_driver *drv) { ktime_t calltime, rettime; int ret; calltime = ktime_get(); ret = really_probe(dev, drv); rettime = ktime_get(); /* * Don't change this to pr_debug() because that requires * CONFIG_DYNAMIC_DEBUG and we want a simple 'initcall_debug' on the * kernel commandline to print this all the time at the debug level. */ printk(KERN_DEBUG "probe of %s returned %d after %lld usecs\n", dev_name(dev), ret, ktime_us_delta(rettime, calltime)); return ret; } /** * driver_probe_done * Determine if the probe sequence is finished or not. * * Should somehow figure out how to use a semaphore, not an atomic variable... */ int driver_probe_done(void) { int local_probe_count = atomic_read(&probe_count); pr_debug("%s: probe_count = %d\n", __func__, local_probe_count); if (local_probe_count) return -EBUSY; return 0; } /** * wait_for_device_probe * Wait for device probing to be completed. */ void wait_for_device_probe(void) { /* wait for the deferred probe workqueue to finish */ flush_work(&deferred_probe_work); /* wait for the known devices to complete their probing */ wait_event(probe_waitqueue, atomic_read(&probe_count) == 0); async_synchronize_full(); } EXPORT_SYMBOL_GPL(wait_for_device_probe); static int __driver_probe_device(struct device_driver *drv, struct device *dev) { int ret = 0; if (dev->p->dead || !device_is_registered(dev)) return -ENODEV; if (dev->driver) return -EBUSY; dev->can_match = true; pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); pm_runtime_get_suppliers(dev); if (dev->parent) pm_runtime_get_sync(dev->parent); pm_runtime_barrier(dev); if (initcall_debug) ret = really_probe_debug(dev, drv); else ret = really_probe(dev, drv); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); pm_runtime_put_suppliers(dev); return ret; } /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to * @dev: device to try to bind to the driver * * This function returns -ENODEV if the device is not registered, -EBUSY if it * already has a driver, 0 if the device is bound successfully and a positive * (inverted) error code for failures from the ->probe method. * * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. * * If the device has a parent, runtime-resume the parent before driver probing. */ static int driver_probe_device(struct device_driver *drv, struct device *dev) { int trigger_count = atomic_read(&deferred_trigger_count); int ret; atomic_inc(&probe_count); ret = __driver_probe_device(drv, dev); if (ret == -EPROBE_DEFER || ret == EPROBE_DEFER) { driver_deferred_probe_add(dev); /* * Did a trigger occur while probing? Need to re-trigger if yes */ if (trigger_count != atomic_read(&deferred_trigger_count) && !defer_all_probes) driver_deferred_probe_trigger(); } atomic_dec(&probe_count); wake_up_all(&probe_waitqueue); return ret; } static inline bool cmdline_requested_async_probing(const char *drv_name) { return parse_option_str(async_probe_drv_names, drv_name); } /* The option format is "driver_async_probe=drv_name1,drv_name2,..." */ static int __init save_async_options(char *buf) { if (strlen(buf) >= ASYNC_DRV_NAMES_MAX_LEN) pr_warn("Too long list of driver names for 'driver_async_probe'!\n"); strlcpy(async_probe_drv_names, buf, ASYNC_DRV_NAMES_MAX_LEN); return 1; } __setup("driver_async_probe=", save_async_options); bool driver_allows_async_probing(struct device_driver *drv) { switch (drv->probe_type) { case PROBE_PREFER_ASYNCHRONOUS: return true; case PROBE_FORCE_SYNCHRONOUS: return false; default: if (cmdline_requested_async_probing(drv->name)) return true; if (module_requested_async_probing(drv->owner)) return true; return false; } } struct device_attach_data { struct device *dev; /* * Indicates whether we are are considering asynchronous probing or * not. Only initial binding after device or driver registration * (including deferral processing) may be done asynchronously, the * rest is always synchronous, as we expect it is being done by * request from userspace. */ bool check_async; /* * Indicates if we are binding synchronous or asynchronous drivers. * When asynchronous probing is enabled we'll execute 2 passes * over drivers: first pass doing synchronous probing and second * doing asynchronous probing (if synchronous did not succeed - * most likely because there was no driver requiring synchronous * probing - and we found asynchronous driver during first pass). * The 2 passes are done because we can't shoot asynchronous * probe for given device and driver from bus_for_each_drv() since * driver pointer is not guaranteed to stay valid once * bus_for_each_drv() iterates to the next driver on the bus. */ bool want_async; /* * We'll set have_async to 'true' if, while scanning for matching * driver, we'll encounter one that requests asynchronous probing. */ bool have_async; }; static int __device_attach_driver(struct device_driver *drv, void *_data) { struct device_attach_data *data = _data; struct device *dev = data->dev; bool async_allowed; int ret; ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Device can't match with a driver right now, so don't attempt * to match or bind with other drivers on the bus. */ return ret; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); return ret; } /* ret > 0 means positive match */ async_allowed = driver_allows_async_probing(drv); if (async_allowed) data->have_async = true; if (data->check_async && async_allowed != data->want_async) return 0; /* * Ignore errors returned by ->probe so that the next driver can try * its luck. */ ret = driver_probe_device(drv, dev); if (ret < 0) return ret; return ret == 0; } static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_attach_data data = { .dev = dev, .check_async = true, .want_async = true, }; device_lock(dev); /* * Check if device has already been removed or claimed. This may * happen with driver loading, device discovery/registration, * and deferred probe processing happens all at once with * multiple threads. */ if (dev->p->dead || dev->driver) goto out_unlock; if (dev->parent) pm_runtime_get_sync(dev->parent); bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); dev_dbg(dev, "async probe completed\n"); pm_request_idle(dev); if (dev->parent) pm_runtime_put(dev->parent); out_unlock: device_unlock(dev); put_device(dev); } static int __device_attach(struct device *dev, bool allow_async) { int ret = 0; bool async = false; device_lock(dev); if (dev->p->dead) { goto out_unlock; } else if (dev->driver) { if (device_is_bound(dev)) { ret = 1; goto out_unlock; } ret = device_bind_driver(dev); if (ret == 0) ret = 1; else { dev->driver = NULL; ret = 0; } } else { struct device_attach_data data = { .dev = dev, .check_async = allow_async, .want_async = false, }; if (dev->parent) pm_runtime_get_sync(dev->parent); ret = bus_for_each_drv(dev->bus, NULL, &data, __device_attach_driver); if (!ret && allow_async && data.have_async) { /* * If we could not find appropriate driver * synchronously and we are allowed to do * async probes and there are drivers that * want to probe asynchronously, we'll * try them. */ dev_dbg(dev, "scheduling asynchronous probe\n"); get_device(dev); async = true; } else { pm_request_idle(dev); } if (dev->parent) pm_runtime_put(dev->parent); } out_unlock: device_unlock(dev); if (async) async_schedule_dev(__device_attach_async_helper, dev); return ret; } /** * device_attach - try to attach device to a driver. * @dev: device. * * Walk the list of drivers that the bus has and call * driver_probe_device() for each pair. If a compatible * pair is found, break out and return. * * Returns 1 if the device was bound to a driver; * 0 if no matching driver was found; * -ENODEV if the device is not registered. * * When called for a USB interface, @dev->parent lock must be held. */ int device_attach(struct device *dev) { return __device_attach(dev, false); } EXPORT_SYMBOL_GPL(device_attach); void device_initial_probe(struct device *dev) { __device_attach(dev, true); } /* * __device_driver_lock - acquire locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will take the required locks for manipulating dev->drv. * Normally this will just be the @dev lock, but when called for a USB * interface, @parent lock will be held as well. */ static void __device_driver_lock(struct device *dev, struct device *parent) { if (parent && dev->bus->need_parent_lock) device_lock(parent); device_lock(dev); } /* * __device_driver_unlock - release locks needed to manipulate dev->drv * @dev: Device we will update driver info for * @parent: Parent device. Needed if the bus requires parent lock * * This function will release the required locks for manipulating dev->drv. * Normally this will just be the the @dev lock, but when called for a * USB interface, @parent lock will be released as well. */ static void __device_driver_unlock(struct device *dev, struct device *parent) { device_unlock(dev); if (parent && dev->bus->need_parent_lock) device_unlock(parent); } /** * device_driver_attach - attach a specific driver to a specific device * @drv: Driver to attach * @dev: Device to attach it to * * Manually attach driver to a device. Will acquire both @dev lock and * @dev->parent lock if needed. Returns 0 on success, -ERR on failure. */ int device_driver_attach(struct device_driver *drv, struct device *dev) { int ret; __device_driver_lock(dev, dev->parent); ret = __driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); /* also return probe errors as normal negative errnos */ if (ret > 0) ret = -ret; if (ret == -EPROBE_DEFER) return -EAGAIN; return ret; } EXPORT_SYMBOL_GPL(device_driver_attach); static void __driver_attach_async_helper(void *_dev, async_cookie_t cookie) { struct device *dev = _dev; struct device_driver *drv; int ret; __device_driver_lock(dev, dev->parent); drv = dev->p->async_driver; ret = driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); dev_dbg(dev, "driver %s async attach completed: %d\n", drv->name, ret); put_device(dev); } static int __driver_attach(struct device *dev, void *data) { struct device_driver *drv = data; bool async = false; int ret; /* * Lock device and try to bind to it. We drop the error * here and always return 0, because we need to keep trying * to bind to devices and some drivers will return an error * simply if it didn't support the device. * * driver_probe_device() will spit a warning if there * is an error. */ ret = driver_match_device(drv, dev); if (ret == 0) { /* no match */ return 0; } else if (ret == -EPROBE_DEFER) { dev_dbg(dev, "Device match requests probe deferral\n"); dev->can_match = true; driver_deferred_probe_add(dev); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } else if (ret < 0) { dev_dbg(dev, "Bus failed to match device: %d\n", ret); /* * Driver could not match with device, but may match with * another device on the bus. */ return 0; } /* ret > 0 means positive match */ if (driver_allows_async_probing(drv)) { /* * Instead of probing the device synchronously we will * probe it asynchronously to allow for more parallelism. * * We only take the device lock here in order to guarantee * that the dev->driver and async_driver fields are protected */ dev_dbg(dev, "probing driver %s asynchronously\n", drv->name); device_lock(dev); if (!dev->driver) { get_device(dev); dev->p->async_driver = drv; async = true; } device_unlock(dev); if (async) async_schedule_dev(__driver_attach_async_helper, dev); return 0; } __device_driver_lock(dev, dev->parent); driver_probe_device(drv, dev); __device_driver_unlock(dev, dev->parent); return 0; } /** * driver_attach - try to bind driver to devices. * @drv: driver. * * Walk the list of devices that the bus has on it and try to * match the driver with each one. If driver_probe_device() * returns 0 and the @dev->driver is set, we've found a * compatible pair. */ int driver_attach(struct device_driver *drv) { return bus_for_each_dev(drv->bus, NULL, drv, __driver_attach); } EXPORT_SYMBOL_GPL(driver_attach); /* * __device_release_driver() must be called with @dev lock held. * When called for a USB interface, @dev->parent lock must be held as well. */ static void __device_release_driver(struct device *dev, struct device *parent) { struct device_driver *drv; drv = dev->driver; if (drv) { pm_runtime_get_sync(dev); while (device_links_busy(dev)) { __device_driver_unlock(dev, parent); device_links_unbind_consumers(dev); __device_driver_lock(dev, parent); /* * A concurrent invocation of the same function might * have released the driver successfully while this one * was waiting, so check for that. */ if (dev->driver != drv) { pm_runtime_put(dev); return; } } driver_sysfs_remove(dev); if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBIND_DRIVER, dev); pm_runtime_put_sync(dev); device_remove_file(dev, &dev_attr_state_synced); device_remove_groups(dev, drv->dev_groups); if (dev->bus && dev->bus->remove) dev->bus->remove(dev); else if (drv->remove) drv->remove(dev); devres_release_all(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; dev->driver = NULL; dev_set_drvdata(dev, NULL); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); dev_pm_set_driver_flags(dev, 0); device_links_driver_cleanup(dev); klist_remove(&dev->p->knode_driver); device_pm_check_callbacks(dev); if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, dev); kobject_uevent(&dev->kobj, KOBJ_UNBIND); } } void device_release_driver_internal(struct device *dev, struct device_driver *drv, struct device *parent) { __device_driver_lock(dev, parent); if (!drv || drv == dev->driver) __device_release_driver(dev, parent); __device_driver_unlock(dev, parent); } /** * device_release_driver - manually detach device from driver. * @dev: device. * * Manually detach device from driver. * When called for a USB interface, @dev->parent lock must be held. * * If this function is to be called with @dev->parent lock held, ensure that * the device's consumers are unbound in advance or that their locks can be * acquired under the @dev->parent lock. */ void device_release_driver(struct device *dev) { /* * If anyone calls device_release_driver() recursively from * within their ->remove callback for the same device, they * will deadlock right here. */ device_release_driver_internal(dev, NULL, NULL); } EXPORT_SYMBOL_GPL(device_release_driver); /** * device_driver_detach - detach driver from a specific device * @dev: device to detach driver from * * Detach driver from device. Will acquire both @dev lock and @dev->parent * lock if needed. */ void device_driver_detach(struct device *dev) { device_release_driver_internal(dev, NULL, dev->parent); } /** * driver_detach - detach driver from all devices it controls. * @drv: driver. */ void driver_detach(struct device_driver *drv) { struct device_private *dev_prv; struct device *dev; if (driver_allows_async_probing(drv)) async_synchronize_full(); for (;;) { spin_lock(&drv->p->klist_devices.k_lock); if (list_empty(&drv->p->klist_devices.k_list)) { spin_unlock(&drv->p->klist_devices.k_lock); break; } dev_prv = list_last_entry(&drv->p->klist_devices.k_list, struct device_private, knode_driver.n_node); dev = dev_prv->device; get_device(dev); spin_unlock(&drv->p->klist_devices.k_lock); device_release_driver_internal(dev, drv, dev->parent); put_device(dev); } }
4 3 176 176 11 10 1 9 7 3 3 1 2 2 3 2 5 13 5 1 10 2 2 2 2 2 2 4 4 1 4 2 2 5 6 2 5 4 1 2 5 5 4 2 2 1 3 1 1 1 1 1 1 1 1 1 4 1 1 3 4 4 1 3 4 4 1 1 1 1 1 1 2 2 1 1 2 1 3 4 6 6 4 1 1 2 1 1 1 10 1 9 2 1 6 5 5 7 1 7 5 1 1 3 1 1 1 4 4 1 3 3 3 1 1 1 3 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 3 1 1 1 1 1 2 8 1 1 6 1 2 1 3 4 1 1 2 5 1 1 2 1 2 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 1 2 1 2 1 1 4 1 1 2 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 5 1 2 1 1 2 1 5 5 5 5 14 13 1 13 13 14 8 5 8 1 1 8 1 2 5 4 4 4 3 3 2 1 2 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 1 2 3 1 2 1 1 6 1 4 1 106 1 1 1 2 5 3 2 2 1 2 1 4 1 1 7 4 4 2 5 1 2 1 1 3 1 2 3 2 4 1 1 13 1 1 2 5 1 1 8 4 3 2 4 3 1 1 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/kernel.h> #include <linux/bio.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/security.h> #include <linux/xattr.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> #include <linux/btrfs.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/fsverity.h> #include "ctree.h" #include "disk-io.h" #include "export.h" #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" #include "volumes.h" #include "locking.h" #include "backref.h" #include "rcu-string.h" #include "send.h" #include "dev-replace.h" #include "props.h" #include "sysfs.h" #include "qgroup.h" #include "tree-log.h" #include "compression.h" #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI * structures are incorrect, as the timespec structure from userspace * is 4 bytes too small. We define these alternatives here to teach * the kernel about the 32-bit struct packing. */ struct btrfs_ioctl_timespec_32 { __u64 sec; __u32 nsec; } __attribute__ ((__packed__)); struct btrfs_ioctl_received_subvol_args_32 { char uuid[BTRFS_UUID_SIZE]; /* in */ __u64 stransid; /* in */ __u64 rtransid; /* out */ struct btrfs_ioctl_timespec_32 stime; /* in */ struct btrfs_ioctl_timespec_32 rtime; /* out */ __u64 flags; /* in */ __u64 reserved[16]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ struct btrfs_ioctl_received_subvol_args_32) #endif #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_send_args_32 { __s64 send_fd; /* in */ __u64 clone_sources_count; /* in */ compat_uptr_t clone_sources; /* in */ __u64 parent_root; /* in */ __u64 flags; /* in */ __u64 reserved[4]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ struct btrfs_ioctl_send_args_32) #endif /* Mask out flags that are inappropriate for the given type of inode. */ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; else if (S_ISREG(inode->i_mode)) return flags & ~FS_DIRSYNC_FL; else return flags & (FS_NODUMP_FL | FS_NOATIME_FL); } /* * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) { unsigned int iflags = 0; u32 flags = binode->flags; u32 ro_flags = binode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; if (flags & BTRFS_INODE_IMMUTABLE) iflags |= FS_IMMUTABLE_FL; if (flags & BTRFS_INODE_APPEND) iflags |= FS_APPEND_FL; if (flags & BTRFS_INODE_NODUMP) iflags |= FS_NODUMP_FL; if (flags & BTRFS_INODE_NOATIME) iflags |= FS_NOATIME_FL; if (flags & BTRFS_INODE_DIRSYNC) iflags |= FS_DIRSYNC_FL; if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; if (ro_flags & BTRFS_INODE_RO_VERITY) iflags |= FS_VERITY_FL; if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; else if (flags & BTRFS_INODE_COMPRESS) iflags |= FS_COMPR_FL; return iflags; } /* * Update inode->i_flags based on the btrfs internal flags. */ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) { struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; if (binode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; if (binode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; if (binode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; if (binode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; if (binode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; if (binode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; set_mask_bits(&inode->i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } /* * Check if @flags are a supported and valid set of FS_*_FL flags and that * the old and new flags are not conflicting */ static int check_fsflags(unsigned int old_flags, unsigned int flags) { if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ FS_SYNC_FL | FS_DIRSYNC_FL | \ FS_NOCOMP_FL | FS_COMPR_FL | FS_NOCOW_FL)) return -EOPNOTSUPP; /* COMPR and NOCOMP on new/old are valid */ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) return -EINVAL; if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL)) return -EINVAL; /* NOCOW and compression options are mutually exclusive */ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL))) return -EINVAL; if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL))) return -EINVAL; return 0; } static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) return -EPERM; return 0; } /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. */ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); return 0; } int btrfs_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; u32 binode_flags; if (btrfs_root_readonly(root)) return -EROFS; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); old_fsflags = btrfs_inode_flags_to_fsflags(binode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; ret = check_fsflags_compatible(fs_info, fsflags); if (ret) return ret; binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) binode_flags |= BTRFS_INODE_SYNC; else binode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) binode_flags |= BTRFS_INODE_IMMUTABLE; else binode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) binode_flags |= BTRFS_INODE_APPEND; else binode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) binode_flags |= BTRFS_INODE_NODUMP; else binode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) binode_flags |= BTRFS_INODE_NOATIME; else binode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { /* 1 item for the inode */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); goto update_flags; } if (fsflags & FS_DIRSYNC_FL) binode_flags |= BTRFS_INODE_DIRSYNC; else binode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { if (S_ISREG(inode->i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ if (inode->i_size == 0) binode_flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } else { binode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ if (S_ISREG(inode->i_mode)) { if (inode->i_size == 0) binode_flags &= ~(BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM); } else { binode_flags &= ~BTRFS_INODE_NODATACOW; } } /* * The COMPRESS flag can only be changed by users, while the NOCOMPRESS * flag may be changed automatically if compression code won't make * things smaller. */ if (fsflags & FS_NOCOMP_FL) { binode_flags &= ~BTRFS_INODE_COMPRESS; binode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { if (IS_SWAPFILE(inode)) return -ETXTBSY; binode_flags |= BTRFS_INODE_COMPRESS; binode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* * 1 for inode item * 2 for properties */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) return PTR_ERR(trans); if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); return ret; } /* * Start exclusive operation @type, return true on success */ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { bool ret = false; spin_lock(&fs_info->super_lock); if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { fs_info->exclusive_operation = type; ret = true; } spin_unlock(&fs_info->super_lock); return ret; } /* * Conditionally allow to enter the exclusive operation in case it's compatible * with the running one. This must be paired with btrfs_exclop_start_unlock and * btrfs_exclop_finish. * * Compatibility: * - the same type is already running * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller * must check the condition first that would allow none -> @type */ bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { spin_lock(&fs_info->super_lock); if (fs_info->exclusive_operation == type) return true; spin_unlock(&fs_info->super_lock); return false; } void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) { spin_unlock(&fs_info->super_lock); } void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) { spin_lock(&fs_info->super_lock); WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); spin_unlock(&fs_info->super_lock); sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); return put_user(inode->i_generation, arg); } static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * btrfs_trim_block_group() depends on space cache, which is not * available in zoned filesystem. So, disallow fitrim on a zoned * filesystem for now. */ if (btrfs_is_zoned(fs_info)) return -EOPNOTSUPP; /* * If the fs is mounted with nologreplay, which requires it to be * mounted in RO mode as well, we can not allow discard on free space * inside block groups, because log trees refer to extents that are not * pinned in a block group's free space cache (pinning the extents is * precisely the first phase of replaying a log tree). */ if (btrfs_test_opt(fs_info, NOLOGREPLAY)) return -EROFS; rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { if (!device->bdev) continue; q = bdev_get_queue(device->bdev); if (blk_queue_discard(q)) { num_devices++; minlen = min_t(u64, q->limits.discard_granularity, minlen); } } rcu_read_unlock(); if (!num_devices) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; /* * NOTE: Don't truncate the range using super->total_bytes. Bytenr of * block group is in the logical address space, which can be any * sectorsize aligned bytenr in the range [0, U64_MAX]. */ if (range.len < fs_info->sb->s_blocksize) return -EINVAL; range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); if (ret < 0) return ret; if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; return 0; } int __pure btrfs_is_empty_uuid(u8 *uuid) { int i; for (i = 0; i < BTRFS_UUID_SIZE; i++) { if (uuid[i]) return 0; } return 1; } static noinline int create_subvol(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec64 cur_time = current_time(dir); struct inode *inode; int ret; int err; dev_t anon_dev = 0; u64 objectid; u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) return -ENOMEM; ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) goto fail_free; ret = get_anon_bdev(&anon_dev); if (ret < 0) goto fail_free; /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; goto fail_free; } btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * The same as the snapshot creation, please see the comment * of create_snapshot(). */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); if (ret) goto fail_free; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) goto fail; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; } btrfs_mark_buffer_dirty(leaf); inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, fs_info->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_flags(root_item, 0); btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); btrfs_set_root_bytenr(root_item, leaf->start); btrfs_set_root_generation(root_item, trans->transid); btrfs_set_root_level(root_item, 0); btrfs_set_root_refs(root_item, 1); btrfs_set_root_used(root_item, leaf->len); btrfs_set_root_last_snapshot(root_item, 0); btrfs_set_root_generation_v2(root_item, btrfs_root_generation(root_item)); generate_random_guid(root_item->uuid); btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); root_item->ctime = root_item->otime; btrfs_set_root_ctransid(root_item, trans->transid); btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { /* * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the * filesystem in an inconsistent state (an extent item in the * extent tree with a backreference for a root that does not * exists). */ btrfs_tree_lock(leaf); btrfs_clean_tree_block(leaf); btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); goto fail; } free_extent_buffer(leaf); leaf = NULL; key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); goto fail; } /* Freeing will be done in btrfs_put_root() of new_root */ anon_dev = 0; ret = btrfs_record_root_in_trans(trans, new_root); if (ret) { btrfs_put_root(new_root); btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ btrfs_abort_transaction(trans, ret); goto fail; } /* * insert the directory item */ ret = btrfs_set_inode_index(BTRFS_I(dir), &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, ret); fail: kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; if (!ret) { inode = btrfs_lookup_dentry(dir, dentry); if (IS_ERR(inode)) return PTR_ERR(inode); d_instantiate(dentry, inode); } return ret; fail_free: if (anon_dev) free_anon_bdev(anon_dev); kfree(root_item); return ret; } static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; if (btrfs_root_refs(&root->root_item) == 0) return -ENOENT; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; if (atomic_read(&root->nr_swapfiles)) { btrfs_warn(fs_info, "cannot snapshot subvolume with active swapfile"); return -ETXTBSY; } pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); if (!pending_snapshot) return -ENOMEM; ret = get_anon_bdev(&pending_snapshot->anon_dev); if (ret < 0) goto free_pending; pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), GFP_KERNEL); pending_snapshot->path = btrfs_alloc_path(); if (!pending_snapshot->root_item || !pending_snapshot->path) { ret = -ENOMEM; goto free_pending; } btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 - parent dir inode * 2 - dir entries * 1 - root item * 2 - root ref/backref * 1 - root of snapshot * 1 - UUID item */ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, 8, false); if (ret) goto free_pending; pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; pending_snapshot->dir = dir; pending_snapshot->inherit = inherit; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } trans->pending_snapshot = pending_snapshot; ret = btrfs_commit_transaction(trans); if (ret) goto fail; ret = pending_snapshot->error; if (ret) goto fail; ret = btrfs_orphan_cleanup(pending_snapshot->snap); if (ret) goto fail; inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; } d_instantiate(dentry, inode); ret = 0; pending_snapshot->anon_dev = 0; fail: /* Prevent double freeing of anon_dev */ if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); kfree(pending_snapshot); return ret; } /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do anything with * links pointing to it. * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 9. We can't remove a root or mountpoint. * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int btrfs_may_delete(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *victim, int isdir) { int error; if (d_really_is_negative(victim)) return -ENOENT; BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(mnt_userns, dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) return -EOVERFLOW; return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); } /* * Create a new subvolume below @parent. This is largely modeled after * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ static noinline int btrfs_mksubvol(const struct path *parent, struct user_namespace *mnt_userns, const char *name, int namelen, struct btrfs_root *snap_src, bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (error == -EINTR) return error; dentry = lookup_one(mnt_userns, name, parent->dentry, namelen); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; error = btrfs_may_create(mnt_userns, dir, dentry); if (error) goto out_dput; /* * even if this name doesn't exist, we may get hash collisions. * check for them now when we can safely fail */ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, name, namelen); if (error) goto out_dput; down_read(&fs_info->subvol_sem); if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) goto out_up_read; if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); if (!error) fsnotify_mkdir(dir, dentry); out_up_read: up_read(&fs_info->subvol_sem); out_dput: dput(dentry); out_unlock: btrfs_inode_unlock(dir, 0); return error; } static noinline int btrfs_mksnapshot(const struct path *parent, struct user_namespace *mnt_userns, const char *name, int namelen, struct btrfs_root *root, bool readonly, struct btrfs_qgroup_inherit *inherit) { int ret; bool snapshot_force_cow = false; /* * Force new buffered writes to reserve space even when NOCOW is * possible. This is to avoid later writeback (running dealloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ btrfs_drew_read_lock(&root->snapshot_lock); ret = btrfs_start_delalloc_snapshot(root, false); if (ret) goto out; /* * All previous writes have started writeback in NOCOW mode, so now * we force future writes to fallback to COW mode during snapshot * creation. */ atomic_inc(&root->snapshot_force_cow); snapshot_force_cow = true; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); ret = btrfs_mksubvol(parent, mnt_userns, name, namelen, root, readonly, inherit); out: if (snapshot_force_cow) atomic_dec(&root->snapshot_force_cow); btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } /* * When we're defragging a range, we don't want to kick it off again * if it is really just waiting for delalloc to send it down. * If we find a nice big extent or delalloc range for the bytes in the * file you want to defrag, we return 0 to let you know to skip this * part of the file */ static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 end; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); read_unlock(&em_tree->lock); if (em) { end = extent_map_end(em); free_extent_map(em); if (end - offset > thresh) return 0; } /* if we already have a nice delalloc here, just stop */ thresh /= 2; end = count_range_bits(io_tree, &offset, offset + thresh, thresh, EXTENT_DELALLOC, 1); if (end >= thresh) return 0; return 1; } /* * helper function to walk through a file and find extents * newer than a specific transid, and smaller than thresh. * * This is used by the defragging code to find new and small * extents */ static int find_new_extents(struct btrfs_root *root, struct inode *inode, u64 newer_than, u64 *off, u32 thresh) { struct btrfs_path *path; struct btrfs_key min_key; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; int type; int ret; u64 ino = btrfs_ino(BTRFS_I(inode)); path = btrfs_alloc_path(); if (!path) return -ENOMEM; min_key.objectid = ino; min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; while (1) { ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; process_slot: if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) goto none; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(leaf, extent); if (type == BTRFS_FILE_EXTENT_REG && btrfs_file_extent_num_bytes(leaf, extent) < thresh && check_defrag_in_cache(inode, min_key.offset, thresh)) { *off = min_key.offset; btrfs_free_path(path); return 0; } path->slots[0]++; if (path->slots[0] < btrfs_header_nritems(leaf)) { btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); goto process_slot; } if (min_key.offset == (u64)-1) goto none; min_key.offset++; btrfs_release_path(path); } none: btrfs_free_path(path); return -ENOENT; } static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; u64 len = PAGE_SIZE; /* * hopefully we have this extent in the tree already, try without * the full extent lock */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (!em) { struct extent_state *cached = NULL; u64 end = start + len - 1; /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; } return em; } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) { struct extent_map *next; bool ret = true; /* this is the last extent */ if (em->start + em->len >= i_size_read(inode)) return false; next = defrag_lookup_extent(inode, em->start + em->len); if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && (em->block_len > SZ_128K && next->block_len > SZ_128K)) ret = false; free_extent_map(next); return ret; } static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, u64 *last_len, u64 *skip, u64 *defrag_end, int compress) { struct extent_map *em; int ret = 1; bool next_mergeable = true; bool prev_mergeable = true; /* * make sure that once we start defragging an extent, we keep on * defragging it */ if (start < *defrag_end) return 1; *skip = 0; em = defrag_lookup_extent(inode, start); if (!em) return 0; /* this will cover holes, and inline extents */ if (em->block_start >= EXTENT_MAP_LAST_BYTE) { ret = 0; goto out; } if (!*defrag_end) prev_mergeable = false; next_mergeable = defrag_check_next_extent(inode, em); /* * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it */ if (!compress && (*last_len == 0 || *last_len >= thresh) && (em->len >= thresh || (!next_mergeable && !prev_mergeable))) ret = 0; out: /* * last_len ends up being a counter of how many bytes we've defragged. * every time we choose not to defrag an extent, we reset *last_len * so that the next tiny extent will force a defrag. * * The end result of this is that tiny extents before a single big * extent will force at least part of that big extent to be defragged. */ if (ret) { *defrag_end = extent_map_end(em); } else { *last_len = 0; *skip = extent_map_end(em); *defrag_end = 0; } free_extent_map(em); return ret; } /* * it doesn't do much good to defrag one or two pages * at a time. This pulls in a nice chunk of pages * to COW and defrag. * * It also makes sure the delalloc code has enough * dirty data to avoid making new small extents as part * of the defrag * * It's a good idea to start RA on this range * before calling this. */ static int cluster_pages_for_defrag(struct inode *inode, struct page **pages, unsigned long start_index, unsigned long num_pages) { unsigned long file_end; u64 isize = i_size_read(inode); u64 page_start; u64 page_end; u64 page_cnt; u64 start = (u64)start_index << PAGE_SHIFT; u64 search_start; int ret; int i; int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_io_tree *tree; struct extent_changeset *data_reserved = NULL; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); file_end = (isize - 1) >> PAGE_SHIFT; if (!isize || start_index > file_end) return 0; page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, start, page_cnt << PAGE_SHIFT); if (ret) return ret; i_done = 0; tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ for (i = 0; i < page_cnt; i++) { struct page *page; again: page = find_or_create_page(inode->i_mapping, start_index + i, mask); if (!page) break; ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); put_page(page); break; } page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; while (1) { lock_extent_bits(tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), page_start); unlock_extent_cached(tree, page_start, page_end, &cached_state); if (!ordered) break; unlock_page(page); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); /* * we unlocked the page above, so we need check if * it was released or not. */ if (page->mapping != inode->i_mapping) { unlock_page(page); put_page(page); goto again; } } if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); put_page(page); ret = -EIO; break; } } if (page->mapping != inode->i_mapping) { unlock_page(page); put_page(page); goto again; } pages[i] = page; i_done++; } if (!i_done || ret) goto out; if (!(inode->i_sb->s_flags & SB_ACTIVE)) goto out; /* * so now we have a nice long stream of locked * and up to date pages, lets wait on them */ for (i = 0; i < i_done; i++) wait_on_page_writeback(pages[i]); page_start = page_offset(pages[0]); page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); /* * When defragmenting we skip ranges that have holes or inline extents, * (check should_defrag_range()), to avoid unnecessary IO and wasting * space. At btrfs_defrag_file(), we check if a range should be defragged * before locking the inode and then, if it should, we trigger a sync * page cache readahead - we lock the inode only after that to avoid * blocking for too long other tasks that possibly want to operate on * other file ranges. But before we were able to get the inode lock, * some other task may have punched a hole in the range, or we may have * now an inline extent, in which case we should not defrag. So check * for that here, where we have the inode and the range locked, and bail * out if that happened. */ search_start = page_start; while (search_start < page_end) { struct extent_map *em; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, page_end - search_start); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_range; } if (em->block_start >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); /* Ok, 0 means we did not defrag anything */ ret = 0; goto out_unlock_range; } search_start = extent_map_end(em); free_extent_map(em); } clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); if (i_done != page_cnt) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, start, (page_cnt - i_done) << PAGE_SHIFT, true); } set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); for (i = 0; i < i_done; i++) { clear_page_dirty_for_io(pages[i]); ClearPageChecked(pages[i]); set_page_dirty(pages[i]); unlock_page(pages[i]); put_page(pages[i]); } btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out_unlock_range: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state); out: for (i = 0; i < i_done; i++) { unlock_page(pages[i]); put_page(pages[i]); } btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, start, page_cnt << PAGE_SHIFT, true); btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; } int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct file_ra_state *ra = NULL; unsigned long last_index; u64 isize = i_size_read(inode); u64 last_len = 0; u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; unsigned long i; unsigned long ra_index = 0; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; unsigned long cluster = max_cluster; u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; if (isize == 0) return 0; if (range->start >= isize) return -EINVAL; if (do_compress) { if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) return -EINVAL; if (range->compress_type) compress_type = range->compress_type; } if (extent_thresh == 0) extent_thresh = SZ_256K; /* * If we were not given a file, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so * we don't error out. */ if (!file) { ra = kzalloc(sizeof(*ra), GFP_KERNEL); if (ra) file_ra_state_init(ra, inode->i_mapping); } else { ra = &file->f_ra; } pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ra; } /* find the last page to defrag */ if (range->start + range->len > range->start) { last_index = min_t(u64, isize - 1, range->start + range->len - 1) >> PAGE_SHIFT; } else { last_index = (isize - 1) >> PAGE_SHIFT; } if (newer_than) { ret = find_new_extents(root, inode, newer_than, &newer_off, SZ_64K); if (!ret) { range->start = newer_off; /* * we always align our defrag to help keep * the extents in the file evenly spaced */ i = (newer_off & new_align) >> PAGE_SHIFT; } else goto out_ra; } else { i = range->start >> PAGE_SHIFT; } if (!max_to_defrag) max_to_defrag = last_index - i + 1; /* * make writeback starts from i, so the defrag range can be * written sequentially. */ if (i < inode->i_mapping->writeback_index) inode->i_mapping->writeback_index = i; while (i <= last_index && defrag_count < max_to_defrag && (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { /* * make sure we stop running if someone unmounts * the FS */ if (!(inode->i_sb->s_flags & SB_ACTIVE)) break; if (btrfs_defrag_cancelled(fs_info)) { btrfs_debug(fs_info, "defrag_file cancelled"); ret = -EAGAIN; goto error; } if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, extent_thresh, &last_len, &skip, &defrag_end, do_compress)){ unsigned long next; /* * the should_defrag function tells us how much to skip * bump our counter by the suggested amount */ next = DIV_ROUND_UP(skip, PAGE_SIZE); i = max(i + 1, next); continue; } if (!newer_than) { cluster = (PAGE_ALIGN(defrag_end) >> PAGE_SHIFT) - i; cluster = min(cluster, max_cluster); } else { cluster = max_cluster; } if (i + cluster > ra_index) { ra_index = max(i, ra_index); if (ra) page_cache_sync_readahead(inode->i_mapping, ra, file, ra_index, cluster); ra_index += cluster; } btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; } else { if (do_compress) BTRFS_I(inode)->defrag_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); } if (ret < 0) { btrfs_inode_unlock(inode, 0); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_inode_unlock(inode, 0); if (newer_than) { if (newer_off == (u64)-1) break; if (ret > 0) i += ret; newer_off = max(newer_off + 1, (u64)i << PAGE_SHIFT); ret = find_new_extents(root, inode, newer_than, &newer_off, SZ_64K); if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_SHIFT; } else { break; } } else { if (ret > 0) { i += ret; last_len += ret << PAGE_SHIFT; } else { i++; last_len = 0; } } } ret = defrag_count; error: if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } if (!file) kfree(ra); kfree(pages); return ret; } /* * Try to start exclusive operation @type or cancel it if it's running. * * Return: * 0 - normal mode, newly claimed op started * >0 - normal mode, something else is running, * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space * ECANCELED - cancel mode, successful cancel * ENOTCONN - cancel mode, operation not running anymore */ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type, bool cancel) { if (!cancel) { /* Start normal op */ if (!btrfs_exclop_start(fs_info, type)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; /* Exclusive operation is now claimed */ return 0; } /* Cancel running op */ if (btrfs_exclop_start_try_lock(fs_info, type)) { /* * This blocks any exclop finish from setting it to NONE, so we * request cancellation. Either it runs and we will wait for it, * or it has finished and no waiting will happen. */ atomic_inc(&fs_info->reloc_cancel_req); btrfs_exclop_start_unlock(fs_info); if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING, TASK_INTERRUPTIBLE); return -ECANCELED; } /* Something else is running or none */ return -ENOTCONN; } static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size; u64 old_size; u64 devid = 1; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; /* * Read the arguments before checking exclusivity to be able to * distinguish regular resize and cancel */ vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); goto out_drop; } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); if (ret) goto out_free; /* Exclusive operation is now claimed */ devstr = strchr(sizestr, ':'); if (devstr) { sizestr = devstr + 1; *devstr = '\0'; devstr = vol_args->name; ret = kstrtoull(devstr, 10, &devid); if (ret) goto out_finish; if (!devid) { ret = -EINVAL; goto out_finish; } btrfs_info(fs_info, "resizing devid %llu", devid); } args.devid = devid; device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; goto out_finish; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_info(fs_info, "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; goto out_finish; } if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; else { if (sizestr[0] == '-') { mod = -1; sizestr++; } else if (sizestr[0] == '+') { mod = 1; sizestr++; } new_size = memparse(sizestr, &retptr); if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; goto out_finish; } } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -EPERM; goto out_finish; } old_size = btrfs_device_get_total_bytes(device); if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; goto out_finish; } new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { ret = -ERANGE; goto out_finish; } new_size = old_size + new_size; } if (new_size < SZ_256M) { ret = -EINVAL; goto out_finish; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; goto out_finish; } new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_finish; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans); } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } /* equal, nothing need to do */ if (ret == 0 && new_size != old_size) btrfs_info_in_rcu(fs_info, "resize device %s (devid %llu) from %llu to %llu", rcu_str_deref(device->name), device->devid, old_size, new_size); out_finish: btrfs_exclop_finish(fs_info); out_free: kfree(vol_args); out_drop: mnt_drop_write_file(file); return ret; } static noinline int __btrfs_ioctl_snap_create(struct file *file, struct user_namespace *mnt_userns, const char *name, unsigned long fd, int subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; ret = mnt_want_write_file(file); if (ret) goto out; namelen = strlen(name); if (strchr(name, '/')) { ret = -EINVAL; goto out_drop_write; } if (name[0] == '.' && (namelen == 1 || (name[1] == '.' && namelen == 2))) { ret = -EEXIST; goto out_drop_write; } if (subvol) { ret = btrfs_mksubvol(&file->f_path, mnt_userns, name, namelen, NULL, readonly, inherit); } else { struct fd src = fdget(fd); struct inode *src_inode; if (!src.file) { ret = -EINVAL; goto out_drop_write; } src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; } else if (!inode_owner_or_capable(mnt_userns, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only */ ret = -EPERM; } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { /* * Snapshots must be made with the src_inode referring * to the subvolume inode, otherwise the permission * checking above is useless because we may have * permission on a lower directory but not the subvol * itself. */ ret = -EINVAL; } else { ret = btrfs_mksnapshot(&file->f_path, mnt_userns, name, namelen, BTRFS_I(src_inode)->root, readonly, inherit); } fdput(src); } out_drop_write: mnt_drop_write_file(file); out: return ret; } static noinline int btrfs_ioctl_snap_create(struct file *file, void __user *arg, int subvol) { struct btrfs_ioctl_vol_args *vol_args; int ret; if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), vol_args->name, vol_args->fd, subvol, false, NULL); kfree(vol_args); return ret; } static noinline int btrfs_ioctl_snap_create_v2(struct file *file, void __user *arg, int subvol) { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; goto free_args; } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { u64 nums; if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE) { ret = -EINVAL; goto free_args; } inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); if (IS_ERR(inherit)) { ret = PTR_ERR(inherit); goto free_args; } if (inherit->num_qgroups > PAGE_SIZE || inherit->num_ref_copies > PAGE_SIZE || inherit->num_excl_copies > PAGE_SIZE) { ret = -EINVAL; goto free_inherit; } nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2 * inherit->num_excl_copies; if (vol_args->size != struct_size(inherit, qgroups, nums)) { ret = -EINVAL; goto free_inherit; } } ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), vol_args->name, vol_args->fd, subvol, readonly, inherit); if (ret) goto free_inherit; free_inherit: kfree(inherit); free_args: kfree(vol_args); return ret; } static noinline int btrfs_ioctl_subvol_getflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); if (btrfs_root_readonly(root)) flags |= BTRFS_SUBVOL_RDONLY; up_read(&fs_info->subvol_sem); if (copy_to_user(arg, &flags, sizeof(flags))) ret = -EFAULT; return ret; } static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; u64 flags; int ret = 0; if (!inode_owner_or_capable(file_mnt_user_ns(file), inode)) return -EPERM; ret = mnt_want_write_file(file); if (ret) goto out; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out_drop_write; } if (copy_from_user(&flags, arg, sizeof(flags))) { ret = -EFAULT; goto out_drop_write; } if (flags & ~BTRFS_SUBVOL_RDONLY) { ret = -EOPNOTSUPP; goto out_drop_write; } down_write(&fs_info->subvol_sem); /* nothing to do */ if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) goto out_drop_sem; root_flags = btrfs_root_flags(&root->root_item); if (flags & BTRFS_SUBVOL_RDONLY) { btrfs_set_root_flags(&root->root_item, root_flags | BTRFS_ROOT_SUBVOL_RDONLY); } else { /* * Block RO -> RW transition if this subvolume is involved in * send */ spin_lock(&root->root_item_lock); if (root->send_in_progress == 0) { btrfs_set_root_flags(&root->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); spin_unlock(&root->root_item_lock); } else { spin_unlock(&root->root_item_lock); btrfs_warn(fs_info, "Attempt to set subvolume %llu read-write during send", root->root_key.objectid); ret = -EPERM; goto out_drop_sem; } } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_reset; } ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { btrfs_end_transaction(trans); goto out_reset; } ret = btrfs_commit_transaction(trans); out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); out_drop_sem: up_write(&fs_info->subvol_sem); out_drop_write: mnt_drop_write_file(file); out: return ret; } static noinline int key_in_sk(struct btrfs_key *key, struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; test.objectid = sk->min_objectid; test.type = sk->min_type; test.offset = sk->min_offset; ret = btrfs_comp_cpu_keys(key, &test); if (ret < 0) return 0; test.objectid = sk->max_objectid; test.type = sk->max_type; test.offset = sk->max_offset; ret = btrfs_comp_cpu_keys(key, &test); if (ret > 0) return 0; return 1; } static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) { u64 found_transid; struct extent_buffer *leaf; struct btrfs_ioctl_search_header sh; struct btrfs_key test; unsigned long item_off; unsigned long item_len; int nritems; int i; int slot; int ret = 0; leaf = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); if (btrfs_header_generation(leaf) > sk->max_transid) { i = nritems; goto advance_key; } found_transid = btrfs_header_generation(leaf); for (i = slot; i < nritems; i++) { item_off = btrfs_item_ptr_offset(leaf, i); item_len = btrfs_item_size_nr(leaf, i); btrfs_item_key_to_cpu(leaf, key, i); if (!key_in_sk(key, sk)) continue; if (sizeof(sh) + item_len > *buf_size) { if (*num_found) { ret = 1; goto out; } /* * return one empty item back for v1, which does not * handle -EOVERFLOW */ *buf_size = sizeof(sh) + item_len; item_len = 0; ret = -EOVERFLOW; } if (sizeof(sh) + item_len + *sk_offset > *buf_size) { ret = 1; goto out; } sh.objectid = key->objectid; sh.offset = key->offset; sh.type = key->type; sh.len = item_len; sh.transid = found_transid; /* * Copy search result header. If we fault then loop again so we * can fault in the pages and -EFAULT there if there's a * problem. Otherwise we'll fault and then copy the buffer in * properly this next time through */ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { ret = 0; goto out; } *sk_offset += sizeof(sh); if (item_len) { char __user *up = ubuf + *sk_offset; /* * Copy the item, same behavior as above, but reset the * * sk_offset so we copy the full thing again. */ if (read_extent_buffer_to_user_nofault(leaf, up, item_off, item_len)) { ret = 0; *sk_offset -= sizeof(sh); goto out; } *sk_offset += item_len; } (*num_found)++; if (ret) /* -EOVERFLOW from above */ goto out; if (*num_found >= sk->nr_items) { ret = 1; goto out; } } advance_key: ret = 0; test.objectid = sk->max_objectid; test.type = sk->max_type; test.offset = sk->max_offset; if (btrfs_comp_cpu_keys(key, &test) >= 0) ret = 1; else if (key->offset < (u64)-1) key->offset++; else if (key->type < (u8)-1) { key->offset = 0; key->type++; } else if (key->objectid < (u64)-1) { key->offset = 0; key->type = 0; key->objectid++; } else ret = 1; out: /* * 0: all items from this leaf copied, continue with next * 1: * more items can be copied, but unused buffer is too small * * all items were found * Either way, it will stops the loop which iterates to the next * leaf * -EOVERFLOW: item was to large for buffer * -EFAULT: could not copy extent buffer back to userspace */ return ret; } static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; int ret; int num_found = 0; unsigned long sk_offset = 0; if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { *buf_size = sizeof(struct btrfs_ioctl_search_header); return -EOVERFLOW; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (sk->tree_id == 0) { /* search the root of the inode that was passed */ root = btrfs_grab_root(BTRFS_I(inode)->root); } else { root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); return PTR_ERR(root); } } key.objectid = sk->min_objectid; key.type = sk->min_type; key.offset = sk->min_offset; while (1) { ret = -EFAULT; if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) ret = 0; goto err; } ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); if (ret) break; } if (ret > 0) ret = 0; err: sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } static noinline int btrfs_ioctl_tree_search(struct file *file, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs; struct btrfs_ioctl_search_key sk; struct inode *inode; int ret; u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; uargs = (struct btrfs_ioctl_search_args __user *)argp; if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; buf_size = sizeof(uargs->buf); inode = file_inode(file); ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a * search header with a len of zero, so reset ret. */ if (ret == -EOVERFLOW) ret = 0; if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) ret = -EFAULT; return ret; } static noinline int btrfs_ioctl_tree_search_v2(struct file *file, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg; struct btrfs_ioctl_search_args_v2 args; struct inode *inode; int ret; u64 buf_size; const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* copy search header and buffer size */ uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; if (copy_from_user(&args, uarg, sizeof(args))) return -EFAULT; buf_size = args.buf_size; /* limit result size to 16MB */ if (buf_size > buf_limit) buf_size = buf_limit; inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; else if (ret == -EOVERFLOW && copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) ret = -EFAULT; return ret; } /* * Search INODE_REFs to identify path name of 'dirid' directory * in a 'tree_id' tree. and sets path name to 'name'. */ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, u64 tree_id, u64 dirid, char *name) { struct btrfs_root *root; struct btrfs_key key; char *ptr; int ret = -1; int slot; int len; int total_len = 0; struct btrfs_inode_ref *iref; struct extent_buffer *l; struct btrfs_path *path; if (dirid == BTRFS_FIRST_FREE_OBJECTID) { name[0]='\0'; return 0; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; root = btrfs_get_fs_root(info, tree_id, true); if (IS_ERR(root)) { ret = PTR_ERR(root); root = NULL; goto out; } key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out; else if (ret > 0) { ret = -ENOENT; goto out; } l = path->nodes[0]; slot = path->slots[0]; iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(l, iref); ptr -= len + 1; total_len += len + 1; if (ptr < name) { ret = -ENAMETOOLONG; goto out; } *(ptr + len) = '/'; read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); if (key.offset == BTRFS_FIRST_FREE_OBJECTID) break; btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; } memmove(name, ptr, total_len); name[total_len] = '\0'; ret = 0; out: btrfs_put_root(root); btrfs_free_path(path); return ret; } static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, struct inode *inode, struct btrfs_ioctl_ino_lookup_user_args *args) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct super_block *sb = inode->i_sb; struct btrfs_key upper_limit = BTRFS_I(inode)->location; u64 treeid = BTRFS_I(inode)->root->root_key.objectid; u64 dirid = args->dirid; unsigned long item_off; unsigned long item_len; struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct btrfs_root *root = NULL; struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; struct inode *temp_inode; char *ptr; int slot; int len; int total_len = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * If the bottom subvolume does not exist directly under upper_limit, * construct the path in from the bottom up. */ if (dirid != upper_limit.objectid) { ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; else if (ret > 0) { ret = -ENOENT; goto out_put; } leaf = path->nodes[0]; slot = path->slots[0]; iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(leaf, iref); ptr -= len + 1; total_len += len + 1; if (ptr < args->path) { ret = -ENAMETOOLONG; goto out_put; } *(ptr + len) = '/'; read_extent_buffer(leaf, ptr, (unsigned long)(iref + 1), len); /* Check the read+exec permission of this directory */ ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_ITEM_KEY); if (ret < 0) { goto out_put; } else if (ret > 0) { ret = -ENOENT; goto out_put; } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key2, slot); if (key2.objectid != dirid) { ret = -ENOENT; goto out_put; } /* * We don't need the path anymore, so release it and * avoid deadlocks and lockdep warnings in case * btrfs_iget() needs to lookup the inode from its root * btree and lock the same leaf. */ btrfs_release_path(path); temp_inode = btrfs_iget(sb, key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; } ret = inode_permission(mnt_userns, temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) goto out_put; if (key.offset == upper_limit.objectid) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; goto out_put; } key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; } memmove(args->path, ptr, total_len); args->path[total_len] = '\0'; btrfs_put_root(root); root = NULL; btrfs_release_path(path); } /* Get the bottom subvolume's name from ROOT_REF */ key.objectid = treeid; key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0) { ret = -ENOENT; goto out; } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); item_off = btrfs_item_ptr_offset(leaf, slot); item_len = btrfs_item_size_nr(leaf, slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { ret = -EINVAL; goto out; } /* Copy subvolume's name */ item_off += sizeof(struct btrfs_root_ref); item_len -= sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, args->name, item_off, item_len); args->name[item_len] = 0; out_put: btrfs_put_root(root); out: btrfs_free_path(path); return ret; } static noinline int btrfs_ioctl_ino_lookup(struct file *file, void __user *argp) { struct btrfs_ioctl_ino_lookup_args *args; struct inode *inode; int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); inode = file_inode(file); /* * Unprivileged query to obtain the containing subvolume root id. The * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; goto out; } if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out; } ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, args->treeid, args->objectid, args->name); out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; kfree(args); return ret; } /* * Version of ino_lookup ioctl (unprivileged) * * The main differences from ino_lookup ioctl are: * * 1. Read + Exec permission will be checked using inode_permission() during * path construction. -EACCES will be returned in case of failure. * 2. Path construction will be stopped at the inode number which corresponds * to the fd with which this ioctl is called. If constructed path does not * exist under fd's inode, -EACCES will be returned. * 3. The name of bottom subvolume is also searched and filled. */ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) { struct btrfs_ioctl_ino_lookup_user_args *args; struct inode *inode; int ret; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); inode = file_inode(file); if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) { /* * The subvolume does not exist under fd with which this is * called */ kfree(args); return -EACCES; } ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; kfree(args); return ret; } /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) { struct btrfs_ioctl_get_subvol_info_args *subvol_info; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_root_item *root_item; struct btrfs_root_ref *rref; struct extent_buffer *leaf; unsigned long item_off; unsigned long item_len; struct inode *inode; int slot; int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL); if (!subvol_info) { btrfs_free_path(path); return -ENOMEM; } inode = file_inode(file); fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ key.objectid = BTRFS_I(inode)->root->root_key.objectid; root = btrfs_get_fs_root(fs_info, key.objectid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; } root_item = &root->root_item; subvol_info->treeid = key.objectid; subvol_info->generation = btrfs_root_generation(root_item); subvol_info->flags = btrfs_root_flags(root_item); memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE); memcpy(subvol_info->parent_uuid, root_item->parent_uuid, BTRFS_UUID_SIZE); memcpy(subvol_info->received_uuid, root_item->received_uuid, BTRFS_UUID_SIZE); subvol_info->ctransid = btrfs_root_ctransid(root_item); subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime); subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime); subvol_info->otransid = btrfs_root_otransid(root_item); subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime); subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime); subvol_info->stransid = btrfs_root_stransid(root_item); subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime); subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime); subvol_info->rtransid = btrfs_root_rtransid(root_item); subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime); subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime); if (key.objectid != BTRFS_FS_TREE_OBJECTID) { /* Search root tree for ROOT_BACKREF of this subvolume */ key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; } else if (ret > 0) { ret = -EUCLEAN; goto out; } } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == subvol_info->treeid && key.type == BTRFS_ROOT_BACKREF_KEY) { subvol_info->parent_id = key.offset; rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref); item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(struct btrfs_root_ref); item_len = btrfs_item_size_nr(leaf, slot) - sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, subvol_info->name, item_off, item_len); } else { ret = -ENOENT; goto out; } } btrfs_free_path(path); path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; out: btrfs_put_root(root); out_free: btrfs_free_path(path); kfree(subvol_info); return ret; } /* * Return ROOT_REF information of the subvolume containing this inode * except the subvolume name. */ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp) { struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; struct btrfs_root_ref *rref; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; struct inode *inode; u64 objectid; int slot; int ret; u8 found; path = btrfs_alloc_path(); if (!path) return -ENOMEM; rootrefs = memdup_user(argp, sizeof(*rootrefs)); if (IS_ERR(rootrefs)) { btrfs_free_path(path); return PTR_ERR(rootrefs); } inode = file_inode(file); root = BTRFS_I(inode)->root->fs_info->tree_root; objectid = BTRFS_I(inode)->root->root_key.objectid; key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; found = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) { goto out; } else if (ret > 0) { ret = -EUCLEAN; goto out; } } while (1) { leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) { ret = 0; goto out; } if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) { ret = -EOVERFLOW; goto out; } rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); rootrefs->rootref[found].treeid = key.offset; rootrefs->rootref[found].dirid = btrfs_root_ref_dirid(leaf, rref); found++; ret = btrfs_next_item(root, path); if (ret < 0) { goto out; } else if (ret > 0) { ret = -EUCLEAN; goto out; } } out: btrfs_free_path(path); if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ if (found) rootrefs->min_treeid = rootrefs->rootref[found - 1].treeid + 1; if (copy_to_user(argp, rootrefs, sizeof(*rootrefs))) ret = -EFAULT; } kfree(rootrefs); return ret; } static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args = NULL; struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct user_namespace *mnt_userns = file_mnt_user_ns(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; int err = 0; bool destroy_parent = false; if (destroy_v2) { vol_args2 = memdup_user(arg, sizeof(*vol_args2)); if (IS_ERR(vol_args2)) return PTR_ERR(vol_args2); if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { err = -EOPNOTSUPP; goto out; } /* * If SPEC_BY_ID is not set, we are looking for the subvolume by * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0; subvol_name = vol_args2->name; err = mnt_want_write_file(file); if (err) goto out; } else { struct inode *old_dir; if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out; } err = mnt_want_write_file(file); if (err) goto out; dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, vol_args2->subvolid, 0, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_drop_write; } /* * Change the default parent since the subvolume being * deleted can be outside of the current mount point. */ parent = btrfs_get_parent(dentry); /* * At this point dentry->d_name can point to '/' if the * subvolume we want to destroy is outsite of the * current mount point, so we need to release the * current dentry and execute the lookup to return a new * one with ->d_name pointing to the * <mount point>/subvol_name. */ dput(dentry); if (IS_ERR(parent)) { err = PTR_ERR(parent); goto out_drop_write; } old_dir = dir; dir = d_inode(parent); /* * If v2 was used with SPEC_BY_ID, a new parent was * allocated since the subvolume can be outside of the * current mount point. Later on we need to release this * new parent dentry. */ destroy_parent = true; /* * On idmapped mounts, deletion via subvolid is * restricted to subvolumes that are immediate * ancestors of the inode referenced by the file * descriptor in the ioctl. Otherwise the idmapping * could potentially be abused to delete subvolumes * anywhere in the filesystem the user wouldn't be able * to delete without an idmapped mount. */ if (old_dir != dir && mnt_userns != &init_user_ns) { err = -EOPNOTSUPP; goto free_parent; } subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { err = PTR_ERR(subvol_name_ptr); goto free_parent; } /* subvol_name_ptr is already nul terminated */ subvol_name = (char *)kbasename(subvol_name_ptr); } } else { vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = 0; subvol_name = vol_args->name; err = mnt_want_write_file(file); if (err) goto out; } subvol_namelen = strlen(subvol_name); if (strchr(subvol_name, '/') || strncmp(subvol_name, "..", subvol_namelen) == 0) { err = -EINVAL; goto free_subvol_name; } if (!S_ISDIR(dir->i_mode)) { err = -ENOTDIR; goto free_subvol_name; } err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) goto free_subvol_name; dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; } if (d_really_is_negative(dentry)) { err = -ENOENT; goto out_dput; } inode = d_inode(dentry); dest = BTRFS_I(inode)->root; if (!capable(CAP_SYS_ADMIN)) { /* * Regular user. Only allow this with a special mount * option, when the user has write+exec access to the * subvol root, and when rmdir(2) would have been * allowed. * * Note that this is _not_ check that the subvol is * empty or doesn't contain data that we wouldn't * otherwise be able to delete. * * Users who want to delete empty subvols should try * rmdir(2). */ err = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* * Do not allow deletion if the parent dir is the same * as the dir to be deleted. That means the ioctl * must be called on the dentry referencing the root * of the subvol, not a random directory contained * within it. */ err = -EINVAL; if (root == dest) goto out_dput; err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } /* check if subvolume may be deleted by a user */ err = btrfs_may_delete(mnt_userns, dir, dentry, 1); if (err) goto out_dput; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; } btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); if (!err) d_delete_notify(dir, dentry); out_dput: dput(dentry); out_unlock_dir: btrfs_inode_unlock(dir, 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: if (destroy_parent) dput(parent); out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args2); kfree(vol_args); return err; } static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_defrag_range_args range = {0}; int ret; ret = mnt_want_write_file(file); if (ret) return ret; if (btrfs_root_readonly(root)) { ret = -EROFS; goto out; } /* Subpage defrag will be supported in later commits */ if (root->fs_info->sectorsize < PAGE_SIZE) { ret = -ENOTTY; goto out; } switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out; } ret = btrfs_defrag_root(root); break; case S_IFREG: /* * Note that this does not check the file descriptor for write * access. This prevents defragmenting executables that are * running and allows defrag on files open in read-only mode. */ if (!capable(CAP_SYS_ADMIN) && inode_permission(&init_user_ns, inode, MAY_WRITE)) { ret = -EPERM; goto out; } if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; goto out; } if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { ret = -EOPNOTSUPP; goto out; } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; range.extent_thresh = (u32)-1; } } else { /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } ret = btrfs_defrag_file(file_inode(file), file, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; break; default: ret = -EINVAL; } out: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); goto out; } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); kfree(vol_args); out: btrfs_exclop_finish(fs_info); return ret; } static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; struct block_device *bdev = NULL; fmode_t mode; int ret; bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); if (ret) goto out; } ret = mnt_want_write_file(file); if (ret) goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret) goto err_drop; /* Exclusive operation is now claimed */ ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) btrfs_info(fs_info, "device deleted: id %llu", vol_args->devid); else btrfs_info(fs_info, "device deleted: %s", vol_args->name); } err_drop: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); return ret; } static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; struct block_device *bdev = NULL; fmode_t mode; int ret; bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); if (ret) goto out; } ret = mnt_want_write_file(file); if (ret) goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); return ret; } static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 flags_in; int ret = 0; fi_args = memdup_user(arg, sizeof(*fi_args)); if (IS_ERR(fi_args)) return PTR_ERR(fi_args); flags_in = fi_args->flags; memset(fi_args, 0, sizeof(*fi_args)); rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } rcu_read_unlock(); memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; } if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { fi_args->generation = fs_info->generation; fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) { memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid, sizeof(fi_args->metadata_uuid)); fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID; } if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; kfree(fi_args); return ret; } static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; int ret = 0; di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); args.devid = di_args->devid; if (!btrfs_is_empty_uuid(di_args->uuid)) args.uuid = di_args->uuid; rcu_read_lock(); dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { ret = -ENODEV; goto out; } di_args->devid = dev->devid; di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) strscpy(di_args->path, rcu_str_deref(dev->name), sizeof(di_args->path)); else di_args->path[0] = '\0'; out: rcu_read_unlock(); if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ret = -EFAULT; kfree(di_args); return ret; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; u64 objectid = 0; u64 dir_id; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; if (copy_from_user(&objectid, argp, sizeof(objectid))) { ret = -EFAULT; goto out; } if (!objectid) objectid = BTRFS_FS_TREE_OBJECTID; new_root = btrfs_get_fs_root(fs_info, objectid, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); goto out; } if (!is_fstree(new_root->root_key.objectid)) { ret = -ENOENT; goto out_free; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out_free; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_free; } dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { btrfs_release_path(path); btrfs_end_transaction(trans); btrfs_err(fs_info, "Umm, you don't have the default diritem, this isn't going to work"); ret = -ENOENT; goto out_free; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans); out_free: btrfs_put_root(new_root); btrfs_free_path(path); out: mnt_drop_write_file(file); return ret; } static void get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space) { struct btrfs_block_group *block_group; space->total_bytes = 0; space->used_bytes = 0; space->flags = 0; list_for_each_entry(block_group, groups_list, list) { space->flags = block_group->flags; space->total_bytes += block_group->length; space->used_bytes += block_group->used; } } static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; static const u64 types[] = { BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA }; int num_types = 4; int alloc_size; int ret = 0; u64 slot_count = 0; int i, c; if (copy_from_user(&space_args, (struct btrfs_ioctl_space_args __user *)arg, sizeof(space_args))) return -EFAULT; for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; info = NULL; list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } if (!info) continue; down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) slot_count++; } up_read(&info->groups_sem); } /* * Global block reserve, exported as a space_info */ slot_count++; /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; goto out; } slot_count = min_t(u64, space_args.space_slots, slot_count); alloc_size = sizeof(*dest) * slot_count; /* we generally have at most 6 or so space infos, one for each raid * level. So, a whole page should be more than enough for everyone */ if (alloc_size > PAGE_SIZE) return -ENOMEM; space_args.total_spaces = 0; dest = kmalloc(alloc_size, GFP_KERNEL); if (!dest) return -ENOMEM; dest_orig = dest; /* now we have a buffer to copy into */ for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; if (!slot_count) break; info = NULL; list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } if (!info) continue; down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) { get_block_group_info(&info->block_groups[c], &space); memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; slot_count--; } if (!slot_count) break; } up_read(&info->groups_sem); } /* * Add global block reserve */ if (slot_count) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; spin_lock(&block_rsv->lock); space.total_bytes = block_rsv->size; space.used_bytes = block_rsv->size - block_rsv->reserved; spin_unlock(&block_rsv->lock); space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; memcpy(dest, &space, sizeof(space)); space_args.total_spaces++; } user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); if (copy_to_user(user_dest, dest_orig, alloc_size)) ret = -EFAULT; kfree(dest_orig); out: if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) ret = -EFAULT; return ret; } static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, void __user *argp) { struct btrfs_trans_handle *trans; u64 transid; int ret; trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) return PTR_ERR(trans); /* No running transaction, don't bother */ transid = root->fs_info->last_trans_committed; goto out; } transid = trans->transid; ret = btrfs_commit_transaction_async(trans); if (ret) { btrfs_end_transaction(trans); return ret; } out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) return -EFAULT; return 0; } static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, void __user *argp) { u64 transid; if (argp) { if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; } else { transid = 0; /* current trans */ } return btrfs_wait_for_commit(fs_info, transid); } static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); struct btrfs_ioctl_scrub_args *sa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { ret = -EOPNOTSUPP; goto out; } if (!(sa->flags & BTRFS_SCRUB_READONLY)) { ret = mnt_want_write_file(file); if (ret) goto out; } ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); /* * Copy scrub args to user space even if btrfs_scrub_dev() returned an * error. This is important as it allows user space to know how much * progress scrub has done. For example, if scrub is canceled we get * -ECANCELED from btrfs_scrub_dev() and return that error back to user * space. Later user space can inspect the progress from the structure * btrfs_ioctl_scrub_args and resume scrub from where it left off * previously (btrfs-progs does this). * If we fail to copy the btrfs_ioctl_scrub_args structure to user space * then return -EFAULT to signal the structure was not copied or it may * be corrupt and unreliable due to a partial copy. */ if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; if (!(sa->flags & BTRFS_SCRUB_READONLY)) mnt_drop_write_file(file); out: kfree(sa); return ret; } static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return btrfs_scrub_cancel(fs_info); } static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_scrub_args *sa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); return ret; } static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_get_dev_stats *sa; int ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { kfree(sa); return -EPERM; } ret = btrfs_get_dev_stats(fs_info, sa); if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); return ret; } static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_dev_replace_args *p; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; p = memdup_user(arg, sizeof(*p)); if (IS_ERR(p)) return PTR_ERR(p); switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: if (sb_rdonly(fs_info->sb)) { ret = -EROFS; goto out; } if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: btrfs_dev_replace_status(fs_info, p); ret = 0; break; case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: p->result = btrfs_dev_replace_cancel(fs_info); ret = 0; break; default: ret = -EINVAL; break; } if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p))) ret = -EFAULT; out: kfree(p); return ret; } static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; int i; u64 rel_ptr; int size; struct btrfs_ioctl_ino_path_args *ipa = NULL; struct inode_fs_paths *ipath = NULL; struct btrfs_path *path; if (!capable(CAP_DAC_READ_SEARCH)) return -EPERM; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ipa = memdup_user(arg, sizeof(*ipa)); if (IS_ERR(ipa)) { ret = PTR_ERR(ipa); ipa = NULL; goto out; } size = min_t(u32, ipa->size, 4096); ipath = init_ipath(size, root, path); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; goto out; } ret = paths_from_inode(ipa->inum, ipath); if (ret < 0) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { rel_ptr = ipath->fspath->val[i] - (u64)(unsigned long)ipath->fspath->val; ipath->fspath->val[i] = rel_ptr; } btrfs_free_path(path); path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; } out: btrfs_free_path(path); free_ipath(ipath); kfree(ipa); return ret; } static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { int ret = 0; int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) return -EPERM; loi = memdup_user(arg, sizeof(*loi)); if (IS_ERR(loi)) return PTR_ERR(loi); if (version == 1) { ignore_offset = false; size = min_t(u32, loi->size, SZ_64K); } else { /* All reserved bits must be 0 for now */ if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { ret = -EINVAL; goto out_loi; } /* Only accept flags we have defined so far */ if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { ret = -EINVAL; goto out_loi; } ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; size = min_t(u32, loi->size, SZ_16M); } inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); goto out_loi; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, inodes, ignore_offset); btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) goto out; ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, size); if (ret) ret = -EFAULT; out: kvfree(inodes); out_loi: kfree(loi); return ret; } void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; bargs->flags = bctl->flags; if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) bargs->state |= BTRFS_BALANCE_STATE_RUNNING; if (atomic_read(&fs_info->balance_pause_req)) bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; if (atomic_read(&fs_info->balance_cancel_req)) bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); spin_lock(&fs_info->balance_lock); memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); spin_unlock(&fs_info->balance_lock); } static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; bool need_unlock; /* for mut. excl. ops lock */ int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; again: if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; } /* * mut. excl. ops lock is locked. Three possibilities: * (1) some other op is running * (2) balance is running * (3) balance is paused -- special case (think resume) */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { /* this is either (2) or (3) */ if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { mutex_unlock(&fs_info->balance_mutex); /* * Lock released to allow other waiters to continue, * we'll reexamine the status again. */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl && !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { /* this is (3) */ need_unlock = false; goto locked; } mutex_unlock(&fs_info->balance_mutex); goto again; } else { /* this is (2) */ mutex_unlock(&fs_info->balance_mutex); ret = -EINPROGRESS; goto out; } } else { /* this is (1) */ mutex_unlock(&fs_info->balance_mutex); ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } locked: if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { if (!fs_info->balance_ctl) { ret = -ENOTCONN; goto out_bargs; } bctl = fs_info->balance_ctl; spin_lock(&fs_info->balance_lock); bctl->flags |= BTRFS_BALANCE_RESUME; spin_unlock(&fs_info->balance_lock); goto do_balance; } } else { bargs = NULL; } if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; goto out_bargs; } if (arg) { memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); bctl->flags = bargs->flags; } else { /* balance everything - no filters */ bctl->flags |= BTRFS_BALANCE_TYPE_MASK; } if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { ret = -EINVAL; goto out_bctl; } do_balance: /* * Ownership of bctl and exclusive operation goes to btrfs_balance. * bctl is freed in reset_balance_state, or, if restriper was paused * all the way until unmount, in free_fs_info. The flag should be * cleared after reset_balance_state. */ need_unlock = false; ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; if ((ret == 0 || ret == -ECANCELED) && arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } out_bctl: kfree(bctl); out_bargs: kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case BTRFS_BALANCE_CTL_PAUSE: return btrfs_pause_balance(fs_info); case BTRFS_BALANCE_CTL_CANCEL: return btrfs_cancel_balance(fs_info); } return -EINVAL; } static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_balance_args *bargs; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { ret = -ENOTCONN; goto out; } bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); if (!bargs) { ret = -ENOMEM; goto out; } btrfs_update_ioctl_balance_args(fs_info, bargs); if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; kfree(bargs); out: mutex_unlock(&fs_info->balance_mutex); return ret; } static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_ctl_args *sa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } down_write(&fs_info->subvol_sem); switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: ret = btrfs_quota_enable(fs_info); break; case BTRFS_QUOTA_CTL_DISABLE: ret = btrfs_quota_disable(fs_info); break; default: ret = -EINVAL; break; } kfree(sa); up_write(&fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; int ret; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } if (sa->assign) { ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst); } else { ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); } /* update qgroup status and info */ mutex_lock(&fs_info->qgroup_ioctl_lock); err = btrfs_run_qgroups(trans); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) btrfs_handle_fs_error(fs_info, err, "failed to update qgroup status and info"); err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: kfree(sa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; int ret; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } if (!sa->qgroupid) { ret = -EINVAL; goto out; } if (sa->create && is_fstree(sa->qgroupid)) { ret = -EINVAL; goto out; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } if (sa->create) { ret = btrfs_create_qgroup(trans, sa->qgroupid); } else { ret = btrfs_remove_qgroup(trans, sa->qgroupid); } err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: kfree(sa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; int ret; int err; u64 qgroupid; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } qgroupid = sa->qgroupid; if (!qgroupid) { /* take the current subvol as qgroup */ qgroupid = root->root_key.objectid; } ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: kfree(sa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; qsa = memdup_user(arg, sizeof(*qsa)); if (IS_ERR(qsa)) { ret = PTR_ERR(qsa); goto drop_write; } if (qsa->flags) { ret = -EINVAL; goto out; } ret = btrfs_qgroup_rescan(fs_info); out: kfree(qsa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_quota_rescan_args qsa = {0}; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { qsa.flags = 1; qsa.progress = fs_info->qgroup_rescan_progress.objectid; } if (copy_to_user(arg, &qsa, sizeof(qsa))) ret = -EFAULT; return ret; } static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, void __user *arg) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return btrfs_qgroup_wait_for_completion(fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, struct user_namespace *mnt_userns, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; struct timespec64 ct = current_time(inode); int ret = 0; int received_uuid_changed; if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; ret = mnt_want_write_file(file); if (ret < 0) return ret; down_write(&fs_info->subvol_sem); if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out; } if (btrfs_root_readonly(root)) { ret = -EROFS; goto out; } /* * 1 - root item * 2 - uuid items (received uuid + subvol uuid) */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } sa->rtransid = trans->transid; sa->rtime.sec = ct.tv_sec; sa->rtime.nsec = ct.tv_nsec; received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); if (received_uuid_changed && !btrfs_is_empty_uuid(root_item->received_uuid)) { ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } } memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); btrfs_set_root_stransid(root_item, sa->stransid); btrfs_set_root_rtransid(root_item, sa->rtransid); btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { btrfs_end_transaction(trans); goto out; } if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); out: up_write(&fs_info->subvol_sem); mnt_drop_write_file(file); return ret; } #ifdef CONFIG_64BIT static long btrfs_ioctl_set_received_subvol_32(struct file *file, void __user *arg) { struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; struct btrfs_ioctl_received_subvol_args *args64 = NULL; int ret = 0; args32 = memdup_user(arg, sizeof(*args32)); if (IS_ERR(args32)) return PTR_ERR(args32); args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { ret = -ENOMEM; goto out; } memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); args64->stransid = args32->stransid; args64->rtransid = args32->rtransid; args64->stime.sec = args32->stime.sec; args64->stime.nsec = args32->stime.nsec; args64->rtime.sec = args32->rtime.sec; args64->rtime.nsec = args32->rtime.nsec; args64->flags = args32->flags; ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64); if (ret) goto out; memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); args32->stransid = args64->stransid; args32->rtransid = args64->rtransid; args32->stime.sec = args64->stime.sec; args32->stime.nsec = args64->stime.nsec; args32->rtime.sec = args64->rtime.sec; args32->rtime.nsec = args64->rtime.nsec; args32->flags = args64->flags; ret = copy_to_user(arg, args32, sizeof(*args32)); if (ret) ret = -EFAULT; out: kfree(args32); kfree(args64); return ret; } #endif static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { struct btrfs_ioctl_received_subvol_args *sa = NULL; int ret = 0; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa); if (ret) goto out; ret = copy_to_user(arg, sa, sizeof(*sa)); if (ret) ret = -EFAULT; out: kfree(sa); return ret; } static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, void __user *arg) { size_t len; int ret; char label[BTRFS_LABEL_SIZE]; spin_lock(&fs_info->super_lock); memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE); spin_unlock(&fs_info->super_lock); len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { btrfs_warn(fs_info, "label is too long, return the first %zu bytes", --len); } ret = copy_to_user(arg, label, len); return ret ? -EFAULT : 0; } static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; char label[BTRFS_LABEL_SIZE]; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(label, arg, sizeof(label))) return -EFAULT; if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { btrfs_err(fs_info, "unable to set label with more than %d bytes", BTRFS_LABEL_SIZE - 1); return -EINVAL; } ret = mnt_want_write_file(file); if (ret) return ret; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_unlock; } spin_lock(&fs_info->super_lock); strcpy(super_block->label, label); spin_unlock(&fs_info->super_lock); ret = btrfs_commit_transaction(trans); out_unlock: mnt_drop_write_file(file); return ret; } #define INIT_FEATURE_FLAGS(suffix) \ { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } int btrfs_ioctl_get_supported_features(void __user *arg) { static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), INIT_FEATURE_FLAGS(SAFE_SET), INIT_FEATURE_FLAGS(SAFE_CLEAR) }; if (copy_to_user(arg, &features, sizeof(features))) return -EFAULT; return 0; } static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags features; features.compat_flags = btrfs_super_compat_flags(super_block); features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); features.incompat_flags = btrfs_super_incompat_flags(super_block); if (copy_to_user(arg, &features, sizeof(features))) return -EFAULT; return 0; } static int check_feature_bits(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) { const char *type = btrfs_feature_set_name(set); char *names; u64 disallowed, unsupported; u64 set_mask = flags & change_mask; u64 clear_mask = ~flags & change_mask; unsupported = set_mask & ~supported_flags; if (unsupported) { names = btrfs_printable_features(set, unsupported); if (names) { btrfs_warn(fs_info, "this kernel does not support the %s feature bit%s", names, strchr(names, ',') ? "s" : ""); kfree(names); } else btrfs_warn(fs_info, "this kernel does not support %s bits 0x%llx", type, unsupported); return -EOPNOTSUPP; } disallowed = set_mask & ~safe_set; if (disallowed) { names = btrfs_printable_features(set, disallowed); if (names) { btrfs_warn(fs_info, "can't set the %s feature bit%s while mounted", names, strchr(names, ',') ? "s" : ""); kfree(names); } else btrfs_warn(fs_info, "can't set %s bits 0x%llx while mounted", type, disallowed); return -EPERM; } disallowed = clear_mask & ~safe_clear; if (disallowed) { names = btrfs_printable_features(set, disallowed); if (names) { btrfs_warn(fs_info, "can't clear the %s feature bit%s while mounted", names, strchr(names, ',') ? "s" : ""); kfree(names); } else btrfs_warn(fs_info, "can't clear %s bits 0x%llx while mounted", type, disallowed); return -EPERM; } return 0; } #define check_feature(fs_info, change_mask, flags, mask_base) \ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ BTRFS_FEATURE_ ## mask_base ## _SUPP, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; struct btrfs_trans_handle *trans; u64 newflags; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(flags, arg, sizeof(flags))) return -EFAULT; /* Nothing to do */ if (!flags[0].compat_flags && !flags[0].compat_ro_flags && !flags[0].incompat_flags) return 0; ret = check_feature(fs_info, flags[0].compat_flags, flags[1].compat_flags, COMPAT); if (ret) return ret; ret = check_feature(fs_info, flags[0].compat_ro_flags, flags[1].compat_ro_flags, COMPAT_RO); if (ret) return ret; ret = check_feature(fs_info, flags[0].incompat_flags, flags[1].incompat_flags, INCOMPAT); if (ret) return ret; ret = mnt_want_write_file(file); if (ret) return ret; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_drop_write; } spin_lock(&fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); newflags |= flags[0].compat_flags & flags[1].compat_flags; newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); btrfs_set_super_compat_flags(super_block, newflags); newflags = btrfs_super_compat_ro_flags(super_block); newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); btrfs_set_super_compat_ro_flags(super_block, newflags); newflags = btrfs_super_incompat_flags(super_block); newflags |= flags[0].incompat_flags & flags[1].incompat_flags; newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&fs_info->super_lock); ret = btrfs_commit_transaction(trans); out_drop_write: mnt_drop_write_file(file); return ret; } static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) return -EFAULT; arg = kzalloc(sizeof(*arg), GFP_KERNEL); if (!arg) return -ENOMEM; arg->send_fd = args32.send_fd; arg->clone_sources_count = args32.clone_sources_count; arg->clone_sources = compat_ptr(args32.clone_sources); arg->parent_root = args32.parent_root; arg->flags = args32.flags; memcpy(arg->reserved, args32.reserved, sizeof(args32.reserved)); #else return -ENOTTY; #endif } else { arg = memdup_user(argp, sizeof(*arg)); if (IS_ERR(arg)) return PTR_ERR(arg); } ret = btrfs_ioctl_send(file, arg); kfree(arg); return ret; } long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; switch (cmd) { case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case FS_IOC_GETFSLABEL: return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: return btrfs_ioctl_set_fslabel(file, argp); case FITRIM: return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SUBVOL_CREATE_V2: return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp, false); case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: return btrfs_ioctl_subvol_getflags(file, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: return btrfs_ioctl_defrag(file, NULL); case BTRFS_IOC_DEFRAG_RANGE: return btrfs_ioctl_defrag(file, argp); case BTRFS_IOC_RESIZE: return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(fs_info, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_RM_DEV_V2: return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_TREE_SEARCH_V2: return btrfs_ioctl_tree_search_v2(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); case BTRFS_IOC_LOGICAL_INO_V2: return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { int ret; ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* * The transaction thread may want to do more work, * namely it pokes the cleaner kthread that will start * processing uncleaned subvols. */ wake_up_process(fs_info->transaction_kthread); return ret; } case BTRFS_IOC_START_SYNC: return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: return btrfs_ioctl_wait_sync(fs_info, argp); case BTRFS_IOC_SCRUB: return btrfs_ioctl_scrub(file, argp); case BTRFS_IOC_SCRUB_CANCEL: return btrfs_ioctl_scrub_cancel(fs_info); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(fs_info, argp); case BTRFS_IOC_BALANCE_V2: return btrfs_ioctl_balance(file, argp); case BTRFS_IOC_BALANCE_CTL: return btrfs_ioctl_balance_ctl(fs_info, arg); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(fs_info, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); #ifdef CONFIG_64BIT case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: return _btrfs_ioctl_send(file, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: return _btrfs_ioctl_send(file, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); case BTRFS_IOC_QUOTA_CTL: return btrfs_ioctl_quota_ctl(file, argp); case BTRFS_IOC_QGROUP_ASSIGN: return btrfs_ioctl_qgroup_assign(file, argp); case BTRFS_IOC_QGROUP_CREATE: return btrfs_ioctl_qgroup_create(file, argp); case BTRFS_IOC_QGROUP_LIMIT: return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_QUOTA_RESCAN: return btrfs_ioctl_quota_rescan(file, argp); case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: return btrfs_ioctl_quota_rescan_wait(fs_info, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(argp); case BTRFS_IOC_GET_FEATURES: return btrfs_ioctl_get_features(fs_info, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: return btrfs_ioctl_get_subvol_info(file, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: return btrfs_ioctl_get_subvol_rootref(file, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); case FS_IOC_ENABLE_VERITY: return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); } return -ENOTTY; } #ifdef CONFIG_COMPAT long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* * These all access 32-bit values anyway so no further * handling is necessary. */ switch (cmd) { case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; } return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif
12 12 12 1 12 5 11 7 13 4 11 6 12 6 10 5 10 10 6 6 6 6 6 6 6 6 6 6 6 6 1 1 5 5 5 5 5 5 1 1 1 1 1 1 68 68 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 // SPDX-License-Identifier: GPL-2.0 #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/rcupdate.h> #include <linux/rhashtable.h> #include <linux/vmalloc.h> #include <net/genetlink.h> #include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" struct ila_xlat_params { struct ila_params ip; int ifindex; }; struct ila_map { struct ila_xlat_params xp; struct rhash_head node; struct ila_map __rcu *next; struct rcu_head rcu; }; #define MAX_LOCKS 1024 #define LOCKS_PER_CPU 10 static int alloc_ila_locks(struct ila_net *ilan) { return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask, MAX_LOCKS, LOCKS_PER_CPU, GFP_KERNEL); } static u32 hashrnd __read_mostly; static __always_inline void __ila_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static inline u32 ila_locator_hash(struct ila_locator loc) { u32 *v = (u32 *)loc.v32; __ila_hash_secret_init(); return jhash_2words(v[0], v[1], hashrnd); } static inline spinlock_t *ila_get_lock(struct ila_net *ilan, struct ila_locator loc) { return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask]; } static inline int ila_cmp_wildcards(struct ila_map *ila, struct ila_addr *iaddr, int ifindex) { return (ila->xp.ifindex && ila->xp.ifindex != ifindex); } static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *xp) { return (ila->xp.ifindex != xp->ifindex); } static int ila_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct ila_map *ila = obj; return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key); } static inline int ila_order(struct ila_map *ila) { int score = 0; if (ila->xp.ifindex) score += 1 << 1; return score; } static const struct rhashtable_params rht_params = { .nelem_hint = 1024, .head_offset = offsetof(struct ila_map, node), .key_offset = offsetof(struct ila_map, xp.ip.locator_match), .key_len = sizeof(u64), /* identifier */ .max_size = 1048576, .min_size = 256, .automatic_shrinking = true, .obj_cmpfn = ila_cmpfn, }; static int parse_nl_config(struct genl_info *info, struct ila_xlat_params *xp) { memset(xp, 0, sizeof(*xp)); if (info->attrs[ILA_ATTR_LOCATOR]) xp->ip.locator.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR]); if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); if (info->attrs[ILA_ATTR_CSUM_MODE]) xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); else xp->ip.csum_mode = ILA_CSUM_NO_ACTION; if (info->attrs[ILA_ATTR_IDENT_TYPE]) xp->ip.ident_type = nla_get_u8( info->attrs[ILA_ATTR_IDENT_TYPE]); else xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); return 0; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, int ifindex, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc, rht_params); while (ila) { if (!ila_cmp_wildcards(ila, iaddr, ifindex)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } /* Must be called with rcu readlock */ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, struct ila_net *ilan) { struct ila_map *ila; ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); while (ila) { if (!ila_cmp_params(ila, xp)) return ila; ila = rcu_access_pointer(ila->next); } return NULL; } static inline void ila_release(struct ila_map *ila) { kfree_rcu(ila, rcu); } static void ila_free_node(struct ila_map *ila) { struct ila_map *next; /* Assume rcu_readlock held */ while (ila) { next = rcu_access_pointer(ila->next); ila_release(ila); ila = next; } } static void ila_free_cb(void *ptr, void *arg) { ila_free_node((struct ila_map *)ptr); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { ila_xlat_addr(skb, false); return NF_ACCEPT; } static const struct nf_hook_ops ila_nf_hook_ops[] = { { .hook = ila_nf_input, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = -1, }, }; static DEFINE_MUTEX(ila_mutex); static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ mutex_lock(&ila_mutex); if (!ilan->xlat.hooks_registered) { err = nf_register_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); if (!err) WRITE_ONCE(ilan->xlat.hooks_registered, true); } mutex_unlock(&ila_mutex); if (err) return err; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); if (!ila) return -ENOMEM; ila_init_saved_csum(&xp->ip); ila->xp = *xp; order = ila_order(ila); spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); } else { struct ila_map *tila = head, *prev = NULL; do { if (!ila_cmp_params(tila, xp)) { err = -EEXIST; goto out; } if (order > ila_order(tila)) break; prev = tila; tila = rcu_dereference_protected(tila->next, lockdep_is_held(lock)); } while (tila); if (prev) { /* Insert in sub list of head */ RCU_INIT_POINTER(ila->next, tila); rcu_assign_pointer(prev->next, ila); } else { /* Make this ila new head */ RCU_INIT_POINTER(ila->next, head); err = rhashtable_replace_fast(&ilan->xlat.rhash_table, &head->node, &ila->node, rht_params); if (err) goto out; } } out: spin_unlock(lock); if (err) kfree(ila); return err; } static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head, *prev; spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = -ENOENT; spin_lock(lock); head = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &xp->ip.locator_match, rht_params); ila = head; prev = NULL; while (ila) { if (ila_cmp_params(ila, xp)) { prev = ila; ila = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); continue; } err = 0; if (prev) { /* Not head, just delete from list */ rcu_assign_pointer(prev->next, ila->next); } else { /* It is the head. If there is something in the * sublist we need to make a new head. */ head = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); if (head) { /* Put first entry in the sublist into the * table */ err = rhashtable_replace_fast( &ilan->xlat.rhash_table, &ila->node, &head->node, rht_params); if (err) goto out; } else { /* Entry no longer used */ err = rhashtable_remove_fast( &ilan->xlat.rhash_table, &ila->node, rht_params); } } ila_release(ila); break; } out: spin_unlock(lock); return err; } int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params p; int err; err = parse_nl_config(info, &p); if (err) return err; return ila_add_mapping(net, &p); } int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_xlat_params xp; int err; err = parse_nl_config(info, &xp); if (err) return err; ila_del_mapping(net, &xp); return 0; } static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan, struct ila_map *ila) { return ila_get_lock(ilan, ila->xp.ip.locator_match); } int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct rhashtable_iter iter; struct ila_map *ila; spinlock_t *lock; int ret = 0; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter); rhashtable_walk_start(&iter); for (;;) { ila = rhashtable_walk_next(&iter); if (IS_ERR(ila)) { if (PTR_ERR(ila) == -EAGAIN) continue; ret = PTR_ERR(ila); goto done; } else if (!ila) { break; } lock = lock_from_ila_map(ilan, ila); spin_lock(lock); ret = rhashtable_remove_fast(&ilan->xlat.rhash_table, &ila->node, rht_params); if (!ret) ila_free_node(ila); spin_unlock(lock); if (ret) break; } done: rhashtable_walk_stop(&iter); rhashtable_walk_exit(&iter); return ret; } static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, (__force u64)ila->xp.ip.locator.v64, ILA_ATTR_PAD) || nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH, (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || nla_put_u8(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode) || nla_put_u8(msg, ILA_ATTR_IDENT_TYPE, ila->xp.ip.ident_type)) return -1; return 0; } static int ila_dump_info(struct ila_map *ila, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (ila_fill_info(ila, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct sk_buff *msg; struct ila_xlat_params xp; struct ila_map *ila; int ret; ret = parse_nl_config(info, &xp); if (ret) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); ret = -ESRCH; ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); } rcu_read_unlock(); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } struct ila_dump_iter { struct rhashtable_iter rhiter; int skip; }; int ila_xlat_nl_dump_start(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_dump_iter *iter; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter); iter->skip = 0; cb->args[0] = (long)iter; return 0; } int ila_xlat_nl_dump_done(struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; rhashtable_walk_exit(&iter->rhiter); kfree(iter); return 0; } int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0]; struct rhashtable_iter *rhiter = &iter->rhiter; int skip = iter->skip; struct ila_map *ila; int ret; rhashtable_walk_start(rhiter); /* Get first entry */ ila = rhashtable_walk_peek(rhiter); if (ila && !IS_ERR(ila) && skip) { /* Skip over visited entries */ while (ila && skip) { /* Skip over any ila entries in this list that we * have already dumped. */ ila = rcu_access_pointer(ila->next); skip--; } } skip = 0; for (;;) { if (IS_ERR(ila)) { ret = PTR_ERR(ila); if (ret == -EAGAIN) { /* Table has changed and iter has reset. Return * -EAGAIN to the application even if we have * written data to the skb. The application * needs to deal with this. */ goto out_ret; } else { break; } } else if (!ila) { ret = 0; break; } while (ila) { ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, ILA_CMD_GET); if (ret) goto out; skip++; ila = rcu_access_pointer(ila->next); } skip = 0; ila = rhashtable_walk_next(rhiter); } out: iter->skip = skip; ret = (skb->len ? : ret); out_ret: rhashtable_walk_stop(rhiter); return ret; } int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); int err; err = alloc_ila_locks(ilan); if (err) return err; rhashtable_init(&ilan->xlat.rhash_table, &rht_params); return 0; } void ila_xlat_pre_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); if (ilan->xlat.hooks_registered) nf_unregister_net_hooks(net, ila_nf_hook_ops, ARRAY_SIZE(ila_nf_hook_ops)); } void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); /* Assumes skb contains a valid IPv6 header that is pulled */ /* No check here that ILA type in the mapping matches what is in the * address. We assume that whatever sender gaves us can be translated. * The checksum mode however is relevant. */ rcu_read_lock(); ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) ila_update_ipv6_locator(skb, &ila->xp.ip, sir2ila); rcu_read_unlock(); return 0; }
18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_icache.h" #include "xfs_bmap_btree.h" STATIC void xlog_recover_inode_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); } else { struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); } } /* * Inode fork owner changes * * If we have been told that we have to reparent the inode fork, it's because an * extent swap operation on a CRC enabled filesystem has been done and we are * replaying it. We need to walk the BMBT of the appropriate fork and change the * owners of it. * * The complexity here is that we don't have an inode context to work with, so * after we've replayed the inode we need to instantiate one. This is where the * fun begins. * * We are in the middle of log recovery, so we can't run transactions. That * means we cannot use cache coherent inode instantiation via xfs_iget(), as * that will result in the corresponding iput() running the inode through * xfs_inactive(). If we've just replayed an inode core that changes the link * count to zero (i.e. it's been unlinked), then xfs_inactive() will run * transactions (bad!). * * So, to avoid this, we instantiate an inode directly from the inode core we've * just recovered. We have the buffer still locked, and all we really need to * instantiate is the inode core and the forks being modified. We can do this * manually, then run the inode btree owner change, and then tear down the * xfs_inode without having to run any transactions at all. * * Also, because we don't have a transaction context available here but need to * gather all the buffers we modify for writeback so we pass the buffer_list * instead for the operation to use. */ STATIC int xfs_recover_inode_owner_change( struct xfs_mount *mp, struct xfs_dinode *dip, struct xfs_inode_log_format *in_f, struct list_head *buffer_list) { struct xfs_inode *ip; int error; ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); ip = xfs_inode_alloc(mp, in_f->ilf_ino); if (!ip) return -ENOMEM; /* instantiate the inode */ ASSERT(dip->di_version >= 3); error = xfs_inode_from_disk(ip, dip); if (error) goto out_free_ip; if (in_f->ilf_fields & XFS_ILOG_DOWNER) { ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, ip->i_ino, buffer_list); if (error) goto out_free_ip; } if (in_f->ilf_fields & XFS_ILOG_AOWNER) { ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, ip->i_ino, buffer_list); if (error) goto out_free_ip; } out_free_ip: xfs_inode_free(ip); return error; } static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) { return ld->di_version >= 3 && (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); } /* Convert a log timestamp to an ondisk timestamp. */ static inline xfs_timestamp_t xfs_log_dinode_to_disk_ts( struct xfs_log_dinode *from, const xfs_log_timestamp_t its) { struct xfs_legacy_timestamp *lts; struct xfs_log_legacy_timestamp *lits; xfs_timestamp_t ts; if (xfs_log_dinode_has_bigtime(from)) return cpu_to_be64(its); lts = (struct xfs_legacy_timestamp *)&ts; lits = (struct xfs_log_legacy_timestamp *)&its; lts->t_sec = cpu_to_be32(lits->t_sec); lts->t_nsec = cpu_to_be32(lits->t_nsec); return ts; } STATIC void xfs_log_dinode_to_disk( struct xfs_log_dinode *from, struct xfs_dinode *to, xfs_lsn_t lsn) { to->di_magic = cpu_to_be16(from->di_magic); to->di_mode = cpu_to_be16(from->di_mode); to->di_version = from->di_version; to->di_format = from->di_format; to->di_onlink = 0; to->di_uid = cpu_to_be32(from->di_uid); to->di_gid = cpu_to_be32(from->di_gid); to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid_lo = cpu_to_be16(from->di_projid_lo); to->di_projid_hi = cpu_to_be16(from->di_projid_hi); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); to->di_nextents = cpu_to_be32(from->di_nextents); to->di_anextents = cpu_to_be16(from->di_anextents); to->di_forkoff = from->di_forkoff; to->di_aformat = from->di_aformat; to->di_dmevmask = cpu_to_be32(from->di_dmevmask); to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); to->di_gen = cpu_to_be32(from->di_gen); if (from->di_version == 3) { to->di_changecount = cpu_to_be64(from->di_changecount); to->di_crtime = xfs_log_dinode_to_disk_ts(from, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &from->di_uuid); to->di_flushiter = 0; } else { to->di_flushiter = cpu_to_be16(from->di_flushiter); } } STATIC int xlog_recover_inode_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t current_lsn) { struct xfs_inode_log_format *in_f; struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; struct xfs_dinode *dip; int len; char *src; char *dest; int error; int attr_index; uint fields; struct xfs_log_dinode *ldip; uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) goto error; } /* * Inode buffers can be freed, look out for it, * and do not replay the inode. */ if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) { error = 0; trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } trace_xfs_log_recover_inode_recover(log, in_f); error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, &bp, &xfs_inode_buf_ops); if (error) goto error; ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks * like an inode! */ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; } ldip = item->ri_buf[1].i_addr; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", __func__, item, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; } /* * If the inode has an LSN in it, recover the inode only if the on-disk * inode's LSN is older than the lsn of the transaction we are * replaying. We can have multiple checkpoints with the same start LSN, * so the current LSN being equal to the on-disk LSN doesn't necessarily * mean that the on-disk inode is more recent than the change being * replayed. * * We must check the current_lsn against the on-disk inode * here because the we can't trust the log dinode to contain a valid LSN * (see comment below before replaying the log dinode for details). * * Note: we still need to replay an owner change even though the inode * is more recent than the transaction as there is no guarantee that all * the btree blocks are more recent than this transaction, too. */ if (dip->di_version >= 3) { xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto out_owner_change; } } /* * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes * are transactional and if ordering is necessary we can determine that * more accurately by the LSN field in the V3 inode core. Don't trust * the inode versions we might be changing them here - use the * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ if (!xfs_has_v3inodes(mp) && ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers */ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto out_release; } } /* Take the opportunity to reset the flush iteration count */ ldip->di_flushiter = 0; if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr "PTR_FMT", " "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; } } else if (unlikely(S_ISDIR(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && (ldip->di_format != XFS_DINODE_FMT_BTREE) && (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, "%s: Bad dir inode log record, rec ptr "PTR_FMT", " "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = -EFSCORRUPTED; goto out_release; } } if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", __func__, item, dip, bp, in_f->ilf_ino, ldip->di_nextents + ldip->di_anextents, ldip->di_nblocks); error = -EFSCORRUPTED; goto out_release; } if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); error = -EFSCORRUPTED; goto out_release; } isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr "PTR_FMT, __func__, item->ri_buf[1].i_len, item); error = -EFSCORRUPTED; goto out_release; } /* * Recover the log dinode inode into the on disk inode. * * The LSN in the log dinode is garbage - it can be zero or reflect * stale in-memory runtime state that isn't coherent with the changes * logged in this transaction or the changes written to the on-disk * inode. Hence we write the current lSN into the inode because that * matches what xfs_iflush() would write inode the inode when flushing * the changes in this transaction. */ xfs_log_dinode_to_disk(ldip, dip, current_lsn); fields = in_f->ilf_fields; if (fields & XFS_ILOG_DEV) xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); if (in_f->ilf_size == 2) goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || (len == in_f->ilf_dsize)); switch (fields & XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: case XFS_ILOG_DEXT: memcpy(XFS_DFORK_DPTR(dip), src, len); break; case XFS_ILOG_DBROOT: xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), XFS_DFORK_DSIZE(dip, mp)); break; default: /* * There are no data fork flags set. */ ASSERT((fields & XFS_ILOG_DFORK) == 0); break; } /* * If we logged any attribute data, recover it. There may or * may not have been any other non-core data logged in this * transaction. */ if (in_f->ilf_fields & XFS_ILOG_AFORK) { if (in_f->ilf_fields & XFS_ILOG_DFORK) { attr_index = 3; } else { attr_index = 2; } len = item->ri_buf[attr_index].i_len; src = item->ri_buf[attr_index].i_addr; ASSERT(len == in_f->ilf_asize); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { case XFS_ILOG_ADATA: case XFS_ILOG_AEXT: dest = XFS_DFORK_APTR(dip); ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); memcpy(dest, src, len); break; case XFS_ILOG_ABROOT: dest = XFS_DFORK_APTR(dip); xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, (struct xfs_bmdr_block *)dest, XFS_DFORK_ASIZE(dip, mp)); break; default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); error = -EFSCORRUPTED; goto out_release; } } out_owner_change: /* Recover the swapext owner change unless inode has been deleted */ if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); /* re-generate the checksum. */ xfs_dinode_calc_crc(log->l_mp, dip); ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; xfs_buf_delwri_queue(bp, buffer_list); out_release: xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); return error; } const struct xlog_recover_item_ops xlog_inode_item_ops = { .item_type = XFS_LI_INODE, .ra_pass2 = xlog_recover_inode_ra_pass2, .commit_pass2 = xlog_recover_inode_commit_pass2, };
1000 798 7664 2 8 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Tracing hooks * * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * This file defines hook entry points called by core code where * user tracing/debugging support might need to do something. These * entry points are called tracehook_*(). Each hook declared below * has a detailed kerneldoc comment giving the context (locking et * al) from which it is called, and the meaning of its return value. * * Each function here typically has only one call site, so it is ok * to have some nontrivial tracehook_*() inlines. In all cases, the * fast path when no tracing is enabled should be very short. * * The purpose of this file and the tracehook_* layer is to consolidate * the interface that the kernel core and arch code uses to enable any * user debugging or tracing facility (such as ptrace). The interfaces * here are carefully documented so that maintainers of core and arch * code do not need to think about the implementation details of the * tracing facilities. Likewise, maintainers of the tracing code do not * need to understand all the calling core or arch code in detail, just * documented circumstances of each call, such as locking conditions. * * If the calling core code changes so that locking is different, then * it is ok to change the interface documented here. The maintainer of * core code changing should notify the maintainers of the tracing code * that they need to work out the change. * * Some tracehook_*() inlines take arguments that the current tracing * implementations might not necessarily use. These function signatures * are chosen to pass in all the information that is on hand in the * caller and might conceivably be relevant to a tracer, so that the * core code won't have to be updated when tracing adds more features. * If a call site changes so that some of those parameters are no longer * already on hand without extra work, then the tracehook_* interface * can change so there is no make-work burden on the core code. The * maintainer of core code changing should notify the maintainers of the * tracing code that they need to work out the change. */ #ifndef _LINUX_TRACEHOOK_H #define _LINUX_TRACEHOOK_H 1 #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/security.h> #include <linux/task_work.h> #include <linux/memcontrol.h> #include <linux/blk-cgroup.h> struct linux_binprm; /* * ptrace report for syscall entry and exit looks identical. */ static inline int ptrace_report_syscall(struct pt_regs *regs, unsigned long message) { int ptrace = current->ptrace; if (!(ptrace & PT_PTRACED)) return 0; current->ptrace_message = message; ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); /* * this isn't the same as continuing with a signal, but it will do * for normal use. strace only continues with a signal if the * stopping signal is not SIGTRAP. -brl */ if (current->exit_code) { send_sig(current->exit_code, current, 1); current->exit_code = 0; } current->ptrace_message = 0; return fatal_signal_pending(current); } /** * tracehook_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. It is safe to block here, * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. That must prevent normal entry so no system call is * made. If @task ever returns to user mode after this, its register state * is unspecified, but should be something harmless like an %ENOSYS error * return. It should preserve enough information so that syscall_rollback() * can work (see asm-generic/syscall.h). * * Called without locks, just after entering kernel mode. */ static inline __must_check int tracehook_report_syscall_entry( struct pt_regs *regs) { return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** * tracehook_report_syscall_exit - task has just finished a system call * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) { if (step) user_single_step_report(regs); else ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT); } /** * tracehook_signal_handler - signal handler setup is complete * @stepping: nonzero if debugger single-step or block-step in use * * Called by the arch code after a signal handler has been set up. * Register and stack state reflects the user handler about to run. * Signal mask changes have already been made. * * Called without locks, shortly before returning to user mode * (or handling more signals). */ static inline void tracehook_signal_handler(int stepping) { if (stepping) ptrace_notify(SIGTRAP); } /** * set_notify_resume - cause tracehook_notify_resume() to be called * @task: task that will call tracehook_notify_resume() * * Calling this arranges that @task will call tracehook_notify_resume() * before returning to user mode. If it's already running in user mode, * it will enter the kernel and call tracehook_notify_resume() soon. * If it's blocked, it will not be woken. */ static inline void set_notify_resume(struct task_struct *task) { #ifdef TIF_NOTIFY_RESUME if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME)) kick_process(task); #endif } /** * tracehook_notify_resume - report when about to return to user mode * @regs: user-mode registers of @current task * * This is called when %TIF_NOTIFY_RESUME has been set. Now we are * about to return to user mode, and the user state in @regs can be * inspected or adjusted. The caller in arch code has cleared * %TIF_NOTIFY_RESUME before the call. If the flag gets set again * asynchronously, this will be called again before we return to * user mode. * * Called without locks. */ static inline void tracehook_notify_resume(struct pt_regs *regs) { clear_thread_flag(TIF_NOTIFY_RESUME); /* * This barrier pairs with task_work_add()->set_notify_resume() after * hlist_add_head(task->task_works); */ smp_mb__after_atomic(); if (unlikely(current->task_works)) task_work_run(); #ifdef CONFIG_KEYS_REQUEST_CACHE if (unlikely(current->cached_requested_key)) { key_put(current->cached_requested_key); current->cached_requested_key = NULL; } #endif mem_cgroup_handle_over_high(); blkcg_maybe_throttle_current(); rseq_handle_notify_resume(NULL, regs); } /* * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This * is currently used by TWA_SIGNAL based task_work, which requires breaking * wait loops to ensure that task_work is noticed and run. */ static inline void tracehook_notify_signal(void) { clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); if (current->task_works) task_work_run(); } /* * Called when we have work to process from exit_to_user_mode_loop() */ static inline void set_notify_signal(struct task_struct *task) { if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE)) kick_process(task); } #endif /* <linux/tracehook.h> */
14 13 18 122 16 151 59 14 16 142 37 36 108 136 133 4 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP module. * * Version: @(#)udp.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : Turned on udp checksums. I don't want to * chase 'memory corruption' bugs that aren't! */ #ifndef _UDP_H #define _UDP_H #include <linux/list.h> #include <linux/bug.h> #include <net/inet_sock.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/indirect_call_wrapper.h> /** * struct udp_skb_cb - UDP(-Lite) private variables * * @header: private variables used by IPv4/IPv6 * @cscov: checksum coverage length (UDP-Lite only) * @partial_cov: if set indicates partial csum coverage */ struct udp_skb_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; __u16 cscov; __u8 partial_cov; }; #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** * struct udp_hslot - UDP hash slot * * @head: head of list of sockets * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { struct hlist_head head; int count; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); /** * struct udp_table - UDP table * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; struct udp_hslot *hash2; unsigned int mask; unsigned int log; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); static inline struct udp_hslot *udp_hashslot(struct udp_table *table, struct net *net, unsigned int num) { return &table->hash[udp_hashfn(net, num, table->mask)]; } /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() */ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { return &table->hash2[hash & table->mask]; } extern struct proto udp_prot; extern atomic_long_t udp_memory_allocated; /* sysctl variables for udp */ extern long sysctl_udp_mem[3]; extern int sysctl_udp_rmem_min; extern int sysctl_udp_wmem_min; struct sk_buff; /* * Generic checksumming routines for UDP(-Lite) v4 and v6 */ static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb) { return (UDP_SKB_CB(skb)->cscov == skb->len ? __skb_checksum_complete(skb) : __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov)); } static inline int udp_lib_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __udp_lib_checksum_complete(skb); } /** * udp_csum_outgoing - compute UDPv4/v6 checksum over fragments * @sk: socket we are writing to * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) */ static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), 0); skb_queue_walk(&sk->sk_write_queue, skb) { csum = csum_add(csum, skb->csum); } return csum; } static inline __wsum udp_csum(struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), skb->csum); for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) { csum = csum_add(csum, skb->csum); } return csum; } static inline __sum16 udp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); } void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len); static inline void udp_csum_pull_header(struct sk_buff *skb) { if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE) skb->csum = csum_partial(skb->data, sizeof(struct udphdr), skb->csum); skb_pull_rcsum(skb, sizeof(struct udphdr)); UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); } typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, __be16 dport); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); void udp_v6_early_demux(struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; unsigned int hlen, off; off = skb_gro_offset(skb); hlen = off + sizeof(*uh); uh = skb_gro_header_fast(skb, off); if (skb_gro_header_hard(skb, hlen)) uh = skb_gro_header_slow(skb, hlen, off); return uh; } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { BUG(); return 0; } void udp_lib_unhash(struct sock *sk); void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); u32 udp_flow_hashrnd(void); static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, int min, int max, bool use_eth) { u32 hash; if (min >= max) { /* Use default range */ inet_get_local_port_range(net, &min, &max); } hash = skb_get_hash(skb); if (unlikely(!hash)) { if (use_eth) { /* Can't find a normal hash, caller has indicated an * Ethernet packet so use that to compute a hash. */ hash = jhash(skb->data, 2 * ETH_ALEN, (__force u32) skb->protocol); } else { /* Can't derive any sort of hash for the packet, set * to some consistent random value. */ hash = udp_flow_hashrnd(); } } /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only upper 16 bits are relevant in the * computation for 16 bit port value. */ hash ^= hash << 16; return htons((((u64) hash * (max - min)) >> 32) + min); } static inline int udp_rqueue_get(struct sock *sk) { return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) return inet_bound_dev_eq(!!READ_ONCE(net->ipv4.sysctl_udp_l3mdev_accept), bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); #endif } /* net/ipv4/udp.c */ void udp_destruct_common(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *off, int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *err) { int off = 0; return __skb_recv_udp(sk, flags, noblock, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)); struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif); struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); int udp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() */ struct udp_dev_scratch { /* skb->truesize and the stateless bit are embedded in a single field; * do not use a bitfield since the compiler emits better/smaller code * this way */ u32 _tsize_state; #if BITS_PER_LONG == 64 /* len and the bit needed to compute skb_csum_unnecessary * will be on cold cache lines at recvmsg time. * skb->len can be stored on 16 bits since the udp header has been * already validated and pulled. */ u16 len; bool is_linear; bool csum_unnecessary; #endif }; static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb) { return (struct udp_dev_scratch *)&skb->dev_scratch; } #if BITS_PER_LONG == 64 static inline unsigned int udp_skb_len(struct sk_buff *skb) { return udp_skb_scratch(skb)->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return udp_skb_scratch(skb)->csum_unnecessary; } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return udp_skb_scratch(skb)->is_linear; } #else static inline unsigned int udp_skb_len(struct sk_buff *skb) { return skb->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return skb_csum_unnecessary(skb); } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return !skb_is_nonlinear(skb); } #endif static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { int n; n = copy_to_iter(skb->data + off, len, to); if (n == len) return 0; iov_iter_revert(to, n); return -EFAULT; } /* * SNMP statistics for UDP and UDP-Lite */ #define UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else __SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP6_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\ else __SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #define UDP6_INC_STATS(net, field, __lite) do { \ if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field); \ else SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #if IS_ENABLED(CONFIG_IPV6) #define __UDPX_MIB(sk, ipv4) \ ({ \ ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics) : \ (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \ sock_net(sk)->mib.udp_stats_in6); \ }) #else #define __UDPX_MIB(sk, ipv4) \ ({ \ IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics; \ }) #endif #define __UDPX_INC_STATS(sk, field) \ __SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field) #ifdef CONFIG_PROC_FS struct udp_seq_afinfo { sa_family_t family; struct udp_table *udp_table; }; struct udp_iter_state { struct seq_net_private p; int bucket; struct udp_seq_afinfo *bpf_seq_afinfo; }; void *udp_seq_start(struct seq_file *seq, loff_t *pos); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void udp_seq_stop(struct seq_file *seq, void *v); extern const struct seq_operations udp_seq_ops; extern const struct seq_operations udp6_seq_ops; int udp4_proc_init(void); void udp4_proc_exit(void); #endif /* CONFIG_PROC_FS */ int udpv4_offload_init(void); void udp_init(void); DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void); void udp_encap_disable(void); #if IS_ENABLED(CONFIG_IPV6) DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void); #endif static inline struct sk_buff *udp_rcv_segment(struct sock *sk, struct sk_buff *skb, bool ipv4) { netdev_features_t features = NETIF_F_SG; struct sk_buff *segs; int drop_count; /* * Segmentation in UDP receive path is only for UDP GRO, drop udp * fragmentation offload (UFO) packets. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) { drop_count = 1; goto drop; } /* Avoid csum recalculation by skb_segment unless userspace explicitly * asks for the final checksum values */ if (!inet_get_convert_csum(sk)) features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; /* UDP segmentation expects packets of type CHECKSUM_PARTIAL or * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. * Reset in this specific case, where PARTIAL is both correct and * required. */ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; /* the GSO CB lays after the UDP one, no need to save and restore any * CB fragment */ segs = __skb_gso_segment(skb, features, false); if (IS_ERR_OR_NULL(segs)) { drop_count = skb_shinfo(skb)->gso_segs; goto drop; } consume_skb(skb); return segs; drop: atomic_add(drop_count, &sk->sk_drops); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); kfree_skb(skb); return NULL; } static inline void udp_post_segment_fix_csum(struct sk_buff *skb) { /* UDP-lite can't land here - no GRO */ WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov); /* UDP packets generated with UDP_SEGMENT and traversing: * * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx) * * can reach an UDP socket with CHECKSUM_NONE, because * __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE. * SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will * have a valid checksum, as the GRO engine validates the UDP csum * before the aggregation and nobody strips such info in between. * Instead of adding another check in the tunnel fastpath, we can force * a valid csum after the segmentation. * Additionally fixup the UDP CB. */ UDP_SKB_CB(skb)->cscov = skb->len; if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid) skb->csum_valid = 1; } #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); #endif #endif /* _UDP_H */
188 188 147 147 11 11 11 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 * Phillip Lougher <phillip@squashfs.org.uk> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/pagemap.h> #include "page_actor.h" /* * This file contains implementations of page_actor for decompressing into * an intermediate buffer, and for decompressing directly into the * page cache. * * Calling code should avoid sleeping between calls to squashfs_first_page() * and squashfs_finish_page(). */ /* Implementation of page_actor for decompressing into intermediate buffer */ static void *cache_first_page(struct squashfs_page_actor *actor) { actor->next_page = 1; return actor->buffer[0]; } static void *cache_next_page(struct squashfs_page_actor *actor) { if (actor->next_page == actor->pages) return NULL; return actor->buffer[actor->next_page++]; } static void cache_finish_page(struct squashfs_page_actor *actor) { /* empty */ } struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, int pages, int length) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); if (actor == NULL) return NULL; actor->length = length ? : pages * PAGE_SIZE; actor->buffer = buffer; actor->pages = pages; actor->next_page = 0; actor->squashfs_first_page = cache_first_page; actor->squashfs_next_page = cache_next_page; actor->squashfs_finish_page = cache_finish_page; return actor; } /* Implementation of page_actor for decompressing directly into page cache. */ static void *direct_first_page(struct squashfs_page_actor *actor) { actor->next_page = 1; return actor->pageaddr = kmap_atomic(actor->page[0]); } static void *direct_next_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) kunmap_atomic(actor->pageaddr); return actor->pageaddr = actor->next_page == actor->pages ? NULL : kmap_atomic(actor->page[actor->next_page++]); } static void direct_finish_page(struct squashfs_page_actor *actor) { if (actor->pageaddr) kunmap_atomic(actor->pageaddr); } struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page, int pages, int length) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); if (actor == NULL) return NULL; actor->length = length ? : pages * PAGE_SIZE; actor->page = page; actor->pages = pages; actor->next_page = 0; actor->pageaddr = NULL; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; actor->squashfs_finish_page = direct_finish_page; return actor; }
8 18 18 1 6 12 6 6 12 12 12 12 12 12 12 18 18 1 1 1 2 2 2 2 1 1 1 1 1 2 16 17 1 18 16 2 18 17 1 16 2 1 18 1 18 1 7 7 1 1 1 1 1 1 1 1 1 1 1 17 18 18 18 18 18 1 19 19 7 14 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 // SPDX-License-Identifier: GPL-2.0 /* * SME code for cfg80211 * both driver SME event handling and the SME implementation * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved. * Copyright 2017 Intel Deutschland GmbH */ #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/wireless.h> #include <linux/export.h> #include <net/iw_handler.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include "nl80211.h" #include "reg.h" #include "rdev-ops.h" /* * Software SME in cfg80211, using auth/assoc/deauth calls to the * driver. This is for implementing nl80211's connect/disconnect * and wireless extensions (if configured.) */ struct cfg80211_conn { struct cfg80211_connect_params params; /* these are sub-states of the _CONNECTING sme_state */ enum { CFG80211_CONN_SCANNING, CFG80211_CONN_SCAN_AGAIN, CFG80211_CONN_AUTHENTICATE_NEXT, CFG80211_CONN_AUTHENTICATING, CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, CFG80211_CONN_ASSOC_FAILED, CFG80211_CONN_ASSOC_FAILED_TIMEOUT, CFG80211_CONN_DEAUTH, CFG80211_CONN_ABANDON, CFG80211_CONN_CONNECTED, } state; u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; const u8 *ie; size_t ie_len; bool auto_auth, prev_bssid_valid; }; static void cfg80211_sme_free(struct wireless_dev *wdev) { if (!wdev->conn) return; kfree(wdev->conn->ie); kfree(wdev->conn); wdev->conn = NULL; } static int cfg80211_conn_scan(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_scan_request *request; int n_channels, err; ASSERT_WDEV_LOCK(wdev); if (rdev->scan_req || rdev->scan_msg) return -EBUSY; if (wdev->conn->params.channel) n_channels = 1; else n_channels = ieee80211_get_num_supported_channels(wdev->wiphy); request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + sizeof(request->channels[0]) * n_channels, GFP_KERNEL); if (!request) return -ENOMEM; if (wdev->conn->params.channel) { enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = wdev->wiphy->bands[band]; if (!sband) { kfree(request); return -EINVAL; } request->channels[0] = wdev->conn->params.channel; request->rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; enum nl80211_band band; struct ieee80211_supported_band *bands; struct ieee80211_channel *channel; for (band = 0; band < NUM_NL80211_BANDS; band++) { bands = wdev->wiphy->bands[band]; if (!bands) continue; for (j = 0; j < bands->n_channels; j++) { channel = &bands->channels[j]; if (channel->flags & IEEE80211_CHAN_DISABLED) continue; request->channels[i++] = channel; } request->rates[band] = (1 << bands->n_bitrates) - 1; } n_channels = i; } request->n_channels = n_channels; request->ssids = (void *)request + struct_size(request, channels, n_channels); request->n_ssids = 1; memcpy(request->ssids[0].ssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len); request->ssids[0].ssid_len = wdev->conn->params.ssid_len; eth_broadcast_addr(request->bssid); request->wdev = wdev; request->wiphy = &rdev->wiphy; request->scan_start = jiffies; rdev->scan_req = request; err = rdev_scan(rdev, request); if (!err) { wdev->conn->state = CFG80211_CONN_SCANNING; nl80211_send_scan_start(rdev, wdev); dev_hold(wdev->netdev); } else { rdev->scan_req = NULL; kfree(request); } return err; } static int cfg80211_conn_do_work(struct wireless_dev *wdev, enum nl80211_timeout_reason *treason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; struct cfg80211_assoc_request req = {}; int err; ASSERT_WDEV_LOCK(wdev); if (!wdev->conn) return 0; params = &wdev->conn->params; switch (wdev->conn->state) { case CFG80211_CONN_SCANNING: /* didn't find it during scan ... */ return -ENOENT; case CFG80211_CONN_SCAN_AGAIN: return cfg80211_conn_scan(wdev); case CFG80211_CONN_AUTHENTICATE_NEXT: if (WARN_ON(!rdev->ops->auth)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_AUTHENTICATING; return cfg80211_mlme_auth(rdev, wdev->netdev, params->channel, params->auth_type, params->bssid, params->ssid, params->ssid_len, NULL, 0, params->key, params->key_len, params->key_idx, NULL, 0); case CFG80211_CONN_AUTH_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: if (WARN_ON(!rdev->ops->assoc)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_ASSOCIATING; if (wdev->conn->prev_bssid_valid) req.prev_bssid = wdev->conn->prev_bssid; req.ie = params->ie; req.ie_len = params->ie_len; req.use_mfp = params->mfp != NL80211_MFP_NO; req.crypto = params->crypto; req.flags = params->flags; req.ht_capa = params->ht_capa; req.ht_capa_mask = params->ht_capa_mask; req.vht_capa = params->vht_capa; req.vht_capa_mask = params->vht_capa_mask; err = cfg80211_mlme_assoc(rdev, wdev->netdev, params->channel, params->bssid, params->ssid, params->ssid_len, &req); if (err) cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return err; case CFG80211_CONN_ASSOC_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_ASSOC; fallthrough; case CFG80211_CONN_ASSOC_FAILED: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return -ENOTCONN; case CFG80211_CONN_DEAUTH: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); fallthrough; case CFG80211_CONN_ABANDON: /* free directly, disconnected event already sent */ cfg80211_sme_free(wdev); return 0; default: return 0; } } void cfg80211_conn_work(struct work_struct *work) { struct cfg80211_registered_device *rdev = container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev_lock(wdev); if (!netif_running(wdev->netdev)) { wdev_unlock(wdev); continue; } if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) { wdev_unlock(wdev); continue; } if (wdev->conn->params.bssid) { memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); bssid = bssid_buf; } treason = NL80211_TIMEOUT_UNSPECIFIED; if (cfg80211_conn_do_work(wdev, &treason)) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = -1; cr.bssid = bssid; cr.timeout_reason = treason; __cfg80211_connect_result(wdev->netdev, &cr, false); } wdev_unlock(wdev); } wiphy_unlock(&rdev->wiphy); } static void cfg80211_step_auth_next(struct cfg80211_conn *conn, struct cfg80211_bss *bss) { memcpy(conn->bssid, bss->bssid, ETH_ALEN); conn->params.bssid = conn->bssid; conn->params.channel = bss->channel; conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; } /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; ASSERT_WDEV_LOCK(wdev); bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (!bss) return NULL; cfg80211_step_auth_next(wdev->conn, bss); schedule_work(&rdev->conn_work); return bss; } static void __cfg80211_sme_scan_done(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; ASSERT_WDEV_LOCK(wdev); if (!wdev->conn) return; if (wdev->conn->state != CFG80211_CONN_SCANNING && wdev->conn->state != CFG80211_CONN_SCAN_AGAIN) return; bss = cfg80211_get_conn_bss(wdev); if (bss) cfg80211_put_bss(&rdev->wiphy, bss); else schedule_work(&rdev->conn_work); } void cfg80211_sme_scan_done(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; wdev_lock(wdev); __cfg80211_sme_scan_done(dev); wdev_unlock(wdev); } void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; u16 status_code = le16_to_cpu(mgmt->u.auth.status_code); ASSERT_WDEV_LOCK(wdev); if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) return; if (status_code == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG && wdev->conn->auto_auth && wdev->conn->params.auth_type != NL80211_AUTHTYPE_NETWORK_EAP) { /* select automatically between only open, shared, leap */ switch (wdev->conn->params.auth_type) { case NL80211_AUTHTYPE_OPEN_SYSTEM: if (wdev->connect_keys) wdev->conn->params.auth_type = NL80211_AUTHTYPE_SHARED_KEY; else wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; case NL80211_AUTHTYPE_SHARED_KEY: wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; default: /* huh? */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; break; } wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; schedule_work(&rdev->conn_work); } else if (status_code != WLAN_STATUS_SUCCESS) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = status_code; cr.bssid = mgmt->bssid; cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; __cfg80211_connect_result(wdev->netdev, &cr, false); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); } } bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return false; if (status == WLAN_STATUS_SUCCESS) { wdev->conn->state = CFG80211_CONN_CONNECTED; return false; } if (wdev->conn->prev_bssid_valid) { /* * Some stupid APs don't accept reassoc, so we * need to fall back to trying regular assoc; * return true so no event is sent to userspace. */ wdev->conn->prev_bssid_valid = false; wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); return true; } wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; schedule_work(&rdev->conn_work); return false; } void cfg80211_sme_deauth(struct wireless_dev *wdev) { cfg80211_sme_free(wdev); } void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_disassoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_DEAUTH; schedule_work(&rdev->conn_work); } void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ABANDON; schedule_work(&rdev->conn_work); } static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, const u8 *ies, size_t ies_len, const u8 **out_ies, size_t *out_ies_len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *buf; size_t offs; if (!rdev->wiphy.extended_capabilities_len || (ies && cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, ies, ies_len))) { *out_ies = kmemdup(ies, ies_len, GFP_KERNEL); if (!*out_ies) return -ENOMEM; *out_ies_len = ies_len; return 0; } buf = kmalloc(ies_len + rdev->wiphy.extended_capabilities_len + 2, GFP_KERNEL); if (!buf) return -ENOMEM; if (ies_len) { static const u8 before_extcapa[] = { /* not listing IEs expected to be created by driver */ WLAN_EID_RSN, WLAN_EID_QOS_CAPA, WLAN_EID_RRM_ENABLED_CAPABILITIES, WLAN_EID_MOBILITY_DOMAIN, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_BSS_COEX_2040, }; offs = ieee80211_ie_split(ies, ies_len, before_extcapa, ARRAY_SIZE(before_extcapa), 0); memcpy(buf, ies, offs); /* leave a whole for extended capabilities IE */ memcpy(buf + offs + rdev->wiphy.extended_capabilities_len + 2, ies + offs, ies_len - offs); } else { offs = 0; } /* place extended capabilities IE (with only driver capabilities) */ buf[offs] = WLAN_EID_EXT_CAPABILITY; buf[offs + 1] = rdev->wiphy.extended_capabilities_len; memcpy(buf + offs + 2, rdev->wiphy.extended_capabilities, rdev->wiphy.extended_capabilities_len); *out_ies = buf; *out_ies_len = ies_len + rdev->wiphy.extended_capabilities_len + 2; return 0; } static int cfg80211_sme_connect(struct wireless_dev *wdev, struct cfg80211_connect_params *connect, const u8 *prev_bssid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; int err; if (!rdev->ops->auth || !rdev->ops->assoc) return -EOPNOTSUPP; if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; cfg80211_sme_free(wdev); } if (wdev->conn) return -EINPROGRESS; wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL); if (!wdev->conn) return -ENOMEM; /* * Copy all parameters, and treat explicitly IEs, BSSID, SSID. */ memcpy(&wdev->conn->params, connect, sizeof(*connect)); if (connect->bssid) { wdev->conn->params.bssid = wdev->conn->bssid; memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN); } if (cfg80211_sme_get_conn_ies(wdev, connect->ie, connect->ie_len, &wdev->conn->ie, &wdev->conn->params.ie_len)) { kfree(wdev->conn); wdev->conn = NULL; return -ENOMEM; } wdev->conn->params.ie = wdev->conn->ie; if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) { wdev->conn->auto_auth = true; /* start with open system ... should mostly work */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; } else { wdev->conn->auto_auth = false; } wdev->conn->params.ssid = wdev->ssid; wdev->conn->params.ssid_len = wdev->ssid_len; /* see if we have the bss already */ bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (prev_bssid) { memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); wdev->conn->prev_bssid_valid = true; } /* we're good if we have a matching bss struct */ if (bss) { enum nl80211_timeout_reason treason; cfg80211_step_auth_next(wdev->conn, bss); err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ err = cfg80211_conn_scan(wdev); /* * If we can't scan right now, then we need to scan again * after the current scan finished, since the parameters * changed (unless we find a good AP anyway). */ if (err == -EBUSY) { err = 0; wdev->conn->state = CFG80211_CONN_SCAN_AGAIN; } } if (err) cfg80211_sme_free(wdev); return err; } static int cfg80211_sme_disconnect(struct wireless_dev *wdev, u16 reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err; if (!wdev->conn) return 0; if (!rdev->ops->deauth) return -EOPNOTSUPP; if (wdev->conn->state == CFG80211_CONN_SCANNING || wdev->conn->state == CFG80211_CONN_SCAN_AGAIN) { err = 0; goto out; } /* wdev->conn->params.bssid must be set if > SCANNING */ err = cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->conn->params.bssid, NULL, 0, reason, false); out: cfg80211_sme_free(wdev); return err; } /* * code shared for in-device and software SME */ static bool cfg80211_is_all_idle(void) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; bool is_all_idle = true; /* * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. * Also if there is any other active beaconing interface we * need not issue a disconnect hint and reset any info such * as chan dfs state, etc. */ list_for_each_entry(rdev, &cfg80211_rdev_list, list) { list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); if (wdev->conn || wdev->current_bss || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; wdev_unlock(wdev); } } return is_all_idle; } static void disconnect_work(struct work_struct *work) { rtnl_lock(); if (cfg80211_is_all_idle()) regulatory_hint_disconnect(); rtnl_unlock(); } DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); /* * API calls for drivers implementing connect/disconnect and * SME event handling */ /* This method must consume bss one way or another */ void __cfg80211_connect_result(struct net_device *dev, struct cfg80211_connect_resp_params *cr, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; const u8 *country_ie; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) { cfg80211_put_bss(wdev->wiphy, cr->bss); return; } wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (wextev) { if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->req_ie_len; wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, cr->req_ie); } if (cr->resp_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->resp_ie_len; wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, cr->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; if (cr->bssid && cr->status == WLAN_STATUS_SUCCESS) { memcpy(wrqu.ap_addr.sa_data, cr->bssid, ETH_ALEN); memcpy(wdev->wext.prev_bssid, cr->bssid, ETH_ALEN); wdev->wext.prev_bssid_valid = true; } wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); } #endif if (!cr->bss && (cr->status == WLAN_STATUS_SUCCESS)) { WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); cr->bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->bssid, wdev->ssid, wdev->ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (cr->bss) cfg80211_hold_bss(bss_from_pub(cr->bss)); } if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; } if (cr->status != WLAN_STATUS_SUCCESS) { kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; wdev->conn_owner_nlportid = 0; if (cr->bss) { cfg80211_unhold_bss(bss_from_pub(cr->bss)); cfg80211_put_bss(wdev->wiphy, cr->bss); } cfg80211_sme_free(wdev); return; } if (WARN_ON(!cr->bss)) return; wdev->current_bss = bss_from_pub(cr->bss); if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) cfg80211_upload_connect_keys(wdev); rcu_read_lock(); country_ie = ieee80211_bss_get_ie(cr->bss, WLAN_EID_COUNTRY); if (!country_ie) { rcu_read_unlock(); return; } country_ie = kmemdup(country_ie, 2 + country_ie[1], GFP_ATOMIC); rcu_read_unlock(); if (!country_ie) return; /* * ieee80211_bss_get_ie() ensures we can access: * - country_ie + 2, the start of the country ie data, and * - and country_ie[1] which is the IE length */ regulatory_hint_country_ie(wdev->wiphy, cr->bss->channel->band, country_ie + 2, country_ie[1]); kfree(country_ie); } /* Consumes bss object one way or another */ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_connect_resp_params *params, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; if (params->bss) { struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss); if (list_empty(&ibss->list)) { struct cfg80211_bss *found = NULL, *tmp = params->bss; found = cfg80211_get_bss(wdev->wiphy, NULL, params->bss->bssid, wdev->ssid, wdev->ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (found) { /* The same BSS is already updated so use it * instead, as it has latest info. */ params->bss = found; } else { /* Update with BSS provided by driver, it will * be freshly added and ref cnted, we can free * the old one. * * signal_valid can be false, as we are not * expecting the BSS to be found. * * keep the old timestamp to avoid confusion */ cfg80211_bss_update(rdev, ibss, false, ibss->ts); } cfg80211_put_bss(wdev->wiphy, tmp); } } ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + params->req_ie_len + params->resp_ie_len + params->fils.kek_len + params->fils.pmk_len + (params->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!ev) { cfg80211_put_bss(wdev->wiphy, params->bss); return; } ev->type = EVENT_CONNECT_RESULT; next = ((u8 *)ev) + sizeof(*ev); if (params->bssid) { ev->cr.bssid = next; memcpy((void *)ev->cr.bssid, params->bssid, ETH_ALEN); next += ETH_ALEN; } if (params->req_ie_len) { ev->cr.req_ie = next; ev->cr.req_ie_len = params->req_ie_len; memcpy((void *)ev->cr.req_ie, params->req_ie, params->req_ie_len); next += params->req_ie_len; } if (params->resp_ie_len) { ev->cr.resp_ie = next; ev->cr.resp_ie_len = params->resp_ie_len; memcpy((void *)ev->cr.resp_ie, params->resp_ie, params->resp_ie_len); next += params->resp_ie_len; } if (params->fils.kek_len) { ev->cr.fils.kek = next; ev->cr.fils.kek_len = params->fils.kek_len; memcpy((void *)ev->cr.fils.kek, params->fils.kek, params->fils.kek_len); next += params->fils.kek_len; } if (params->fils.pmk_len) { ev->cr.fils.pmk = next; ev->cr.fils.pmk_len = params->fils.pmk_len; memcpy((void *)ev->cr.fils.pmk, params->fils.pmk, params->fils.pmk_len); next += params->fils.pmk_len; } if (params->fils.pmkid) { ev->cr.fils.pmkid = next; memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num; if (params->fils.update_erp_next_seq_num) ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num; if (params->bss) cfg80211_hold_bss(bss_from_pub(params->bss)); ev->cr.bss = params->bss; ev->cr.status = params->status; ev->cr.timeout_reason = params->timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_connect_done); /* Consumes bss object one way or another */ void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info) { #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; if (WARN_ON(!wdev->current_bss)) goto out; cfg80211_unhold_bss(wdev->current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; if (WARN_ON(!info->bss)) return; cfg80211_hold_bss(bss_from_pub(info->bss)); wdev->current_bss = bss_from_pub(info->bss); wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (info->req_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->req_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCREQIE, &wrqu, info->req_ie); } if (info->resp_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->resp_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, &wrqu, info->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; memcpy(wrqu.ap_addr.sa_data, info->bss->bssid, ETH_ALEN); memcpy(wdev->wext.prev_bssid, info->bss->bssid, ETH_ALEN); wdev->wext.prev_bssid_valid = true; wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); #endif return; out: cfg80211_put_bss(wdev->wiphy, info->bss); } /* Consumes info->bss object one way or another */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; if (!info->bss) { info->bss = cfg80211_get_bss(wdev->wiphy, info->channel, info->bssid, wdev->ssid, wdev->ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); } if (WARN_ON(!info->bss)) return; ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + (info->fils.pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!ev) { cfg80211_put_bss(wdev->wiphy, info->bss); return; } ev->type = EVENT_ROAMED; next = ((u8 *)ev) + sizeof(*ev); if (info->req_ie_len) { ev->rm.req_ie = next; ev->rm.req_ie_len = info->req_ie_len; memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); next += info->req_ie_len; } if (info->resp_ie_len) { ev->rm.resp_ie = next; ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); next += info->resp_ie_len; } if (info->fils.kek_len) { ev->rm.fils.kek = next; ev->rm.fils.kek_len = info->fils.kek_len; memcpy((void *)ev->rm.fils.kek, info->fils.kek, info->fils.kek_len); next += info->fils.kek_len; } if (info->fils.pmk_len) { ev->rm.fils.pmk = next; ev->rm.fils.pmk_len = info->fils.pmk_len; memcpy((void *)ev->rm.fils.pmk, info->fils.pmk, info->fils.pmk_len); next += info->fils.pmk_len; } if (info->fils.pmkid) { ev->rm.fils.pmkid = next; memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num; if (info->fils.update_erp_next_seq_num) ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num; ev->rm.bss = info->bss; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_roamed); void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) { ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return; if (WARN_ON(!wdev->current_bss) || WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) return; nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, bssid); } void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; if (WARN_ON(!bssid)) return; ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.bssid, bssid, ETH_ALEN); /* * Use the wdev event list so that if there are pending * connected/roamed events, they will be reported first. */ spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_port_authorized); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif ASSERT_WDEV_LOCK(wdev); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); } wdev->current_bss = NULL; wdev->ssid_len = 0; wdev->conn_owner_nlportid = 0; kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); /* stop critical protocol if supported */ if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. */ if (rdev->ops->del_key) { int max_key_idx = 5; if (wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, i, false, NULL); } rdev_set_qos_map(rdev, dev, NULL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); wdev->wext.connect.ssid_len = 0; #endif schedule_work(&cfg80211_disconnect_work); } void cfg80211_disconnected(struct net_device *dev, u16 reason, const u8 *ie, size_t ie_len, bool locally_generated, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; ev = kzalloc(sizeof(*ev) + ie_len, gfp); if (!ev) return; ev->type = EVENT_DISCONNECTED; ev->dc.ie = ((u8 *)ev) + sizeof(*ev); ev->dc.ie_len = ie_len; memcpy((void *)ev->dc.ie, ie, ie_len); ev->dc.reason = reason; ev->dc.locally_generated = locally_generated; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_disconnected); /* * API calls for nl80211/wext compatibility code */ int cfg80211_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *connect, struct cfg80211_cached_keys *connkeys, const u8 *prev_bssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; ASSERT_WDEV_LOCK(wdev); /* * If we have an ssid_len, we're trying to connect or are * already connected, so reject a new SSID unless it's the * same (which is the case for re-association.) */ if (wdev->ssid_len && (wdev->ssid_len != connect->ssid_len || memcmp(wdev->ssid, connect->ssid, wdev->ssid_len))) return -EALREADY; /* * If connected, reject (re-)association unless prev_bssid * matches the current BSSID. */ if (wdev->current_bss) { if (!prev_bssid) return -EALREADY; if (!ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid)) return -ENOTCONN; } /* * Reject if we're in the process of connecting with WEP, * this case isn't very interesting and trying to handle * it would make the code much more complex. */ if (wdev->connect_keys) return -EINPROGRESS; cfg80211_oper_and_ht_capa(&connect->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; u32 cipher; idx = connkeys->def; cipher = connkeys->params[idx].cipher; /* If given a WEP key we may need it for shared key auth */ if (cipher == WLAN_CIPHER_SUITE_WEP40 || cipher == WLAN_CIPHER_SUITE_WEP104) { connect->key_idx = idx; connect->key = connkeys->params[idx].key; connect->key_len = connkeys->params[idx].key_len; /* * If ciphers are not set (e.g. when going through * iwconfig), we have to set them appropriately here. */ if (connect->crypto.cipher_group == 0) connect->crypto.cipher_group = cipher; if (connect->crypto.n_ciphers_pairwise == 0) { connect->crypto.n_ciphers_pairwise = 1; connect->crypto.ciphers_pairwise[0] = cipher; } } connect->crypto.wep_keys = connkeys->params; connect->crypto.wep_tx_key = connkeys->def; } else { if (WARN_ON(connkeys)) return -EINVAL; /* connect can point to wdev->wext.connect which * can hold key data from a previous connection */ connect->key = NULL; connect->key_len = 0; connect->key_idx = 0; } wdev->connect_keys = connkeys; memcpy(wdev->ssid, connect->ssid, connect->ssid_len); wdev->ssid_len = connect->ssid_len; wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS : IEEE80211_BSS_TYPE_ESS; if (!rdev->ops->connect) err = cfg80211_sme_connect(wdev, connect, prev_bssid); else err = rdev_connect(rdev, dev, connect); if (err) { wdev->connect_keys = NULL; /* * This could be reassoc getting refused, don't clear * ssid_len in that case. */ if (!wdev->current_bss) wdev->ssid_len = 0; return err; } return 0; } int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err = 0; ASSERT_WDEV_LOCK(wdev); kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->conn_owner_nlportid = 0; if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) cfg80211_mlme_down(rdev, dev); else if (wdev->ssid_len) err = rdev_disconnect(rdev, dev, reason); /* * Clear ssid_len unless we actually were fully connected, * in which case cfg80211_disconnected() will take care of * this later. */ if (!wdev->current_bss) wdev->ssid_len = 0; return err; } /* * Used to clean up after the connection / connection attempt owner socket * disconnects */ void cfg80211_autodisconnect_wk(struct work_struct *work) { struct wireless_dev *wdev = container_of(work, struct wireless_dev, disconnect_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); wdev_lock(wdev); if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: __cfg80211_leave_ibss(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: __cfg80211_stop_ap(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_MESH_POINT: __cfg80211_leave_mesh(rdev, wdev->netdev); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: /* * Use disconnect_bssid if still connecting and * ops->disconnect not implemented. Otherwise we can * use cfg80211_disconnect. */ if (rdev->ops->disconnect || wdev->current_bss) cfg80211_disconnect(rdev, wdev->netdev, WLAN_REASON_DEAUTH_LEAVING, true); else cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->disconnect_bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); break; default: break; } } wdev_unlock(wdev); }
10 11 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 // SPDX-License-Identifier: GPL-2.0-only /* * drivers/acpi/device_sysfs.c - ACPI device sysfs attributes and modalias. * * Copyright (C) 2015, Intel Corp. * Author: Mika Westerberg <mika.westerberg@linux.intel.com> * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ #include <linux/acpi.h> #include <linux/device.h> #include <linux/export.h> #include <linux/nls.h> #include "internal.h" static ssize_t acpi_object_path(acpi_handle handle, char *buf) { struct acpi_buffer path = {ACPI_ALLOCATE_BUFFER, NULL}; int result; result = acpi_get_name(handle, ACPI_FULL_PATHNAME, &path); if (result) return result; result = sprintf(buf, "%s\n", (char *)path.pointer); kfree(path.pointer); return result; } struct acpi_data_node_attr { struct attribute attr; ssize_t (*show)(struct acpi_data_node *, char *); ssize_t (*store)(struct acpi_data_node *, const char *, size_t count); }; #define DATA_NODE_ATTR(_name) \ static struct acpi_data_node_attr data_node_##_name = \ __ATTR(_name, 0444, data_node_show_##_name, NULL) static ssize_t data_node_show_path(struct acpi_data_node *dn, char *buf) { return dn->handle ? acpi_object_path(dn->handle, buf) : 0; } DATA_NODE_ATTR(path); static struct attribute *acpi_data_node_default_attrs[] = { &data_node_path.attr, NULL }; #define to_data_node(k) container_of(k, struct acpi_data_node, kobj) #define to_attr(a) container_of(a, struct acpi_data_node_attr, attr) static ssize_t acpi_data_node_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct acpi_data_node *dn = to_data_node(kobj); struct acpi_data_node_attr *dn_attr = to_attr(attr); return dn_attr->show ? dn_attr->show(dn, buf) : -ENXIO; } static const struct sysfs_ops acpi_data_node_sysfs_ops = { .show = acpi_data_node_attr_show, }; static void acpi_data_node_release(struct kobject *kobj) { struct acpi_data_node *dn = to_data_node(kobj); complete(&dn->kobj_done); } static struct kobj_type acpi_data_node_ktype = { .sysfs_ops = &acpi_data_node_sysfs_ops, .default_attrs = acpi_data_node_default_attrs, .release = acpi_data_node_release, }; static void acpi_expose_nondev_subnodes(struct kobject *kobj, struct acpi_device_data *data) { struct list_head *list = &data->subnodes; struct acpi_data_node *dn; if (list_empty(list)) return; list_for_each_entry(dn, list, sibling) { int ret; init_completion(&dn->kobj_done); ret = kobject_init_and_add(&dn->kobj, &acpi_data_node_ktype, kobj, "%s", dn->name); if (!ret) acpi_expose_nondev_subnodes(&dn->kobj, &dn->data); else if (dn->handle) acpi_handle_err(dn->handle, "Failed to expose (%d)\n", ret); } } static void acpi_hide_nondev_subnodes(struct acpi_device_data *data) { struct list_head *list = &data->subnodes; struct acpi_data_node *dn; if (list_empty(list)) return; list_for_each_entry_reverse(dn, list, sibling) { acpi_hide_nondev_subnodes(&dn->data); kobject_put(&dn->kobj); } } /** * create_pnp_modalias - Create hid/cid(s) string for modalias and uevent * @acpi_dev: ACPI device object. * @modalias: Buffer to print into. * @size: Size of the buffer. * * Creates hid/cid(s) string needed for modalias and uevent * e.g. on a device with hid:IBM0001 and cid:ACPI0001 you get: * char *modalias: "acpi:IBM0001:ACPI0001" * Return: 0: no _HID and no _CID * -EINVAL: output error * -ENOMEM: output is truncated */ static int create_pnp_modalias(struct acpi_device *acpi_dev, char *modalias, int size) { int len; int count; struct acpi_hardware_id *id; /* Avoid unnecessarily loading modules for non present devices. */ if (!acpi_device_is_present(acpi_dev)) return 0; /* * Since we skip ACPI_DT_NAMESPACE_HID from the modalias below, 0 should * be returned if ACPI_DT_NAMESPACE_HID is the only ACPI/PNP ID in the * device's list. */ count = 0; list_for_each_entry(id, &acpi_dev->pnp.ids, list) if (strcmp(id->id, ACPI_DT_NAMESPACE_HID)) count++; if (!count) return 0; len = snprintf(modalias, size, "acpi:"); if (len >= size) return -ENOMEM; size -= len; list_for_each_entry(id, &acpi_dev->pnp.ids, list) { if (!strcmp(id->id, ACPI_DT_NAMESPACE_HID)) continue; count = snprintf(&modalias[len], size, "%s:", id->id); if (count < 0) return -EINVAL; if (count >= size) return -ENOMEM; len += count; size -= count; } modalias[len] = '\0'; return len; } /** * create_of_modalias - Creates DT compatible string for modalias and uevent * @acpi_dev: ACPI device object. * @modalias: Buffer to print into. * @size: Size of the buffer. * * Expose DT compatible modalias as of:NnameTCcompatible. This function should * only be called for devices having ACPI_DT_NAMESPACE_HID in their list of * ACPI/PNP IDs. */ static int create_of_modalias(struct acpi_device *acpi_dev, char *modalias, int size) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; const union acpi_object *of_compatible, *obj; acpi_status status; int len, count; int i, nval; char *c; status = acpi_get_name(acpi_dev->handle, ACPI_SINGLE_NAME, &buf); if (ACPI_FAILURE(status)) return -ENODEV; /* DT strings are all in lower case */ for (c = buf.pointer; *c != '\0'; c++) *c = tolower(*c); len = snprintf(modalias, size, "of:N%sT", (char *)buf.pointer); ACPI_FREE(buf.pointer); if (len >= size) return -ENOMEM; size -= len; of_compatible = acpi_dev->data.of_compatible; if (of_compatible->type == ACPI_TYPE_PACKAGE) { nval = of_compatible->package.count; obj = of_compatible->package.elements; } else { /* Must be ACPI_TYPE_STRING. */ nval = 1; obj = of_compatible; } for (i = 0; i < nval; i++, obj++) { count = snprintf(&modalias[len], size, "C%s", obj->string.pointer); if (count < 0) return -EINVAL; if (count >= size) return -ENOMEM; len += count; size -= count; } modalias[len] = '\0'; return len; } int __acpi_device_uevent_modalias(struct acpi_device *adev, struct kobj_uevent_env *env) { int len; if (!adev) return -ENODEV; if (list_empty(&adev->pnp.ids)) return 0; if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; if (adev->data.of_compatible) len = create_of_modalias(adev, &env->buf[env->buflen - 1], sizeof(env->buf) - env->buflen); else len = create_pnp_modalias(adev, &env->buf[env->buflen - 1], sizeof(env->buf) - env->buflen); if (len < 0) return len; env->buflen += len; return 0; } /** * acpi_device_uevent_modalias - uevent modalias for ACPI-enumerated devices. * @dev: Struct device to get ACPI device node. * @env: Environment variables of the kobject uevent. * * Create the uevent modalias field for ACPI-enumerated devices. * * Because other buses do not support ACPI HIDs & CIDs, e.g. for a device with * hid:IBM0001 and cid:ACPI0001 you get: "acpi:IBM0001:ACPI0001". */ int acpi_device_uevent_modalias(struct device *dev, struct kobj_uevent_env *env) { return __acpi_device_uevent_modalias(acpi_companion_match(dev), env); } EXPORT_SYMBOL_GPL(acpi_device_uevent_modalias); static int __acpi_device_modalias(struct acpi_device *adev, char *buf, int size) { int len, count; if (!adev) return -ENODEV; if (list_empty(&adev->pnp.ids)) return 0; len = create_pnp_modalias(adev, buf, size - 1); if (len < 0) { return len; } else if (len > 0) { buf[len++] = '\n'; size -= len; } if (!adev->data.of_compatible) return len; count = create_of_modalias(adev, buf + len, size - 1); if (count < 0) { return count; } else if (count > 0) { len += count; buf[len++] = '\n'; } return len; } /** * acpi_device_modalias - modalias sysfs attribute for ACPI-enumerated devices. * @dev: Struct device to get ACPI device node. * @buf: The buffer to save pnp_modalias and of_modalias. * @size: Size of buffer. * * Create the modalias sysfs attribute for ACPI-enumerated devices. * * Because other buses do not support ACPI HIDs & CIDs, e.g. for a device with * hid:IBM0001 and cid:ACPI0001 you get: "acpi:IBM0001:ACPI0001". */ int acpi_device_modalias(struct device *dev, char *buf, int size) { return __acpi_device_modalias(acpi_companion_match(dev), buf, size); } EXPORT_SYMBOL_GPL(acpi_device_modalias); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { return __acpi_device_modalias(to_acpi_device(dev), buf, 1024); } static DEVICE_ATTR_RO(modalias); static ssize_t real_power_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *adev = to_acpi_device(dev); int state; int ret; ret = acpi_device_get_power(adev, &state); if (ret) return ret; return sprintf(buf, "%s\n", acpi_power_state_string(state)); } static DEVICE_ATTR_RO(real_power_state); static ssize_t power_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *adev = to_acpi_device(dev); return sprintf(buf, "%s\n", acpi_power_state_string(adev->power.state)); } static DEVICE_ATTR_RO(power_state); static ssize_t eject_store(struct device *d, struct device_attribute *attr, const char *buf, size_t count) { struct acpi_device *acpi_device = to_acpi_device(d); acpi_object_type not_used; acpi_status status; if (!count || buf[0] != '1') return -EINVAL; if ((!acpi_device->handler || !acpi_device->handler->hotplug.enabled) && !acpi_device->driver) return -ENODEV; status = acpi_get_type(acpi_device->handle, &not_used); if (ACPI_FAILURE(status) || !acpi_device->flags.ejectable) return -ENODEV; acpi_dev_get(acpi_device); status = acpi_hotplug_schedule(acpi_device, ACPI_OST_EC_OSPM_EJECT); if (ACPI_SUCCESS(status)) return count; acpi_dev_put(acpi_device); acpi_evaluate_ost(acpi_device->handle, ACPI_OST_EC_OSPM_EJECT, ACPI_OST_SC_NON_SPECIFIC_FAILURE, NULL); return status == AE_NO_MEMORY ? -ENOMEM : -EAGAIN; } static DEVICE_ATTR_WO(eject); static ssize_t hid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); return sprintf(buf, "%s\n", acpi_device_hid(acpi_dev)); } static DEVICE_ATTR_RO(hid); static ssize_t uid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); return sprintf(buf, "%s\n", acpi_dev->pnp.unique_id); } static DEVICE_ATTR_RO(uid); static ssize_t adr_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); if (acpi_dev->pnp.bus_address > U32_MAX) return sprintf(buf, "0x%016llx\n", acpi_dev->pnp.bus_address); else return sprintf(buf, "0x%08llx\n", acpi_dev->pnp.bus_address); } static DEVICE_ATTR_RO(adr); static ssize_t path_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); return acpi_object_path(acpi_dev->handle, buf); } static DEVICE_ATTR_RO(path); /* sysfs file that shows description text from the ACPI _STR method */ static ssize_t description_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); int result; if (acpi_dev->pnp.str_obj == NULL) return 0; /* * The _STR object contains a Unicode identifier for a device. * We need to convert to utf-8 so it can be displayed. */ result = utf16s_to_utf8s( (wchar_t *)acpi_dev->pnp.str_obj->buffer.pointer, acpi_dev->pnp.str_obj->buffer.length, UTF16_LITTLE_ENDIAN, buf, PAGE_SIZE - 1); buf[result++] = '\n'; return result; } static DEVICE_ATTR_RO(description); static ssize_t sun_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); acpi_status status; unsigned long long sun; status = acpi_evaluate_integer(acpi_dev->handle, "_SUN", NULL, &sun); if (ACPI_FAILURE(status)) return -EIO; return sprintf(buf, "%llu\n", sun); } static DEVICE_ATTR_RO(sun); static ssize_t hrv_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); acpi_status status; unsigned long long hrv; status = acpi_evaluate_integer(acpi_dev->handle, "_HRV", NULL, &hrv); if (ACPI_FAILURE(status)) return -EIO; return sprintf(buf, "%llu\n", hrv); } static DEVICE_ATTR_RO(hrv); static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); acpi_status status; unsigned long long sta; status = acpi_evaluate_integer(acpi_dev->handle, "_STA", NULL, &sta); if (ACPI_FAILURE(status)) return -EIO; return sprintf(buf, "%llu\n", sta); } static DEVICE_ATTR_RO(status); /** * acpi_device_setup_files - Create sysfs attributes of an ACPI device. * @dev: ACPI device object. */ int acpi_device_setup_files(struct acpi_device *dev) { struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; acpi_status status; int result = 0; /* * Devices gotten from FADT don't have a "path" attribute */ if (dev->handle) { result = device_create_file(&dev->dev, &dev_attr_path); if (result) goto end; } if (!list_empty(&dev->pnp.ids)) { result = device_create_file(&dev->dev, &dev_attr_hid); if (result) goto end; result = device_create_file(&dev->dev, &dev_attr_modalias); if (result) goto end; } /* * If device has _STR, 'description' file is created */ if (acpi_has_method(dev->handle, "_STR")) { status = acpi_evaluate_object_typed(dev->handle, "_STR", NULL, &buffer, ACPI_TYPE_BUFFER); if (ACPI_FAILURE(status)) buffer.pointer = NULL; dev->pnp.str_obj = buffer.pointer; result = device_create_file(&dev->dev, &dev_attr_description); if (result) goto end; } if (dev->pnp.type.bus_address) result = device_create_file(&dev->dev, &dev_attr_adr); if (dev->pnp.unique_id) result = device_create_file(&dev->dev, &dev_attr_uid); if (acpi_has_method(dev->handle, "_SUN")) { result = device_create_file(&dev->dev, &dev_attr_sun); if (result) goto end; } if (acpi_has_method(dev->handle, "_HRV")) { result = device_create_file(&dev->dev, &dev_attr_hrv); if (result) goto end; } if (acpi_has_method(dev->handle, "_STA")) { result = device_create_file(&dev->dev, &dev_attr_status); if (result) goto end; } /* * If device has _EJ0, 'eject' file is created that is used to trigger * hot-removal function from userland. */ if (acpi_has_method(dev->handle, "_EJ0")) { result = device_create_file(&dev->dev, &dev_attr_eject); if (result) return result; } if (dev->flags.power_manageable) { result = device_create_file(&dev->dev, &dev_attr_power_state); if (result) return result; if (dev->power.flags.power_resources) result = device_create_file(&dev->dev, &dev_attr_real_power_state); } acpi_expose_nondev_subnodes(&dev->dev.kobj, &dev->data); end: return result; } /** * acpi_device_remove_files - Remove sysfs attributes of an ACPI device. * @dev: ACPI device object. */ void acpi_device_remove_files(struct acpi_device *dev) { acpi_hide_nondev_subnodes(&dev->data); if (dev->flags.power_manageable) { device_remove_file(&dev->dev, &dev_attr_power_state); if (dev->power.flags.power_resources) device_remove_file(&dev->dev, &dev_attr_real_power_state); } /* * If device has _STR, remove 'description' file */ if (acpi_has_method(dev->handle, "_STR")) { kfree(dev->pnp.str_obj); device_remove_file(&dev->dev, &dev_attr_description); } /* * If device has _EJ0, remove 'eject' file. */ if (acpi_has_method(dev->handle, "_EJ0")) device_remove_file(&dev->dev, &dev_attr_eject); if (acpi_has_method(dev->handle, "_SUN")) device_remove_file(&dev->dev, &dev_attr_sun); if (acpi_has_method(dev->handle, "_HRV")) device_remove_file(&dev->dev, &dev_attr_hrv); if (dev->pnp.unique_id) device_remove_file(&dev->dev, &dev_attr_uid); if (dev->pnp.type.bus_address) device_remove_file(&dev->dev, &dev_attr_adr); device_remove_file(&dev->dev, &dev_attr_modalias); device_remove_file(&dev->dev, &dev_attr_hid); if (acpi_has_method(dev->handle, "_STA")) device_remove_file(&dev->dev, &dev_attr_status); if (dev->handle) device_remove_file(&dev->dev, &dev_attr_path); }
1 1 1 20 21 21 2 2 21 11 11 1 10 10 5 5 5 5 10 19 19 3 10 2 4 4 10 4 10 21 21 1 1 2 2 2 2 15 4 19 5 14 14 14 12 2 12 9 3 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one * Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel). * Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #define HTSIZE 256 struct fw_head { u32 mask; struct fw_filter __rcu *ht[HTSIZE]; struct rcu_head rcu; }; struct fw_filter { struct fw_filter __rcu *next; u32 id; struct tcf_result res; int ifindex; struct tcf_exts exts; struct tcf_proto *tp; struct rcu_work rwork; }; static u32 fw_hash(u32 handle) { handle ^= (handle >> 16); handle ^= (handle >> 8); return handle % HTSIZE; } static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; int r; u32 id = skb->mark; if (head != NULL) { id &= head->mask; for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f; f = rcu_dereference_bh(f->next)) { if (f->id == id) { *res = f->res; if (!tcf_match_indev(skb, f->ifindex)) continue; r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; return r; } } } else { struct Qdisc *q = tcf_block_q(tp->chain->block); /* Old method: classify the packet using its skb mark. */ if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id ^ q->handle)))) { res->classid = id; res->class = 0; return 0; } } return -1; } static void *fw_get(struct tcf_proto *tp, u32 handle) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; if (head == NULL) return NULL; f = rtnl_dereference(head->ht[fw_hash(handle)]); for (; f; f = rtnl_dereference(f->next)) { if (f->id == handle) return f; } return NULL; } static int fw_init(struct tcf_proto *tp) { /* We don't allocate fw_head here, because in the old method * we don't need it at all. */ return 0; } static void __fw_delete_filter(struct fw_filter *f) { tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); } static void fw_delete_filter_work(struct work_struct *work) { struct fw_filter *f = container_of(to_rcu_work(work), struct fw_filter, rwork); rtnl_lock(); __fw_delete_filter(f); rtnl_unlock(); } static void fw_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; int h; if (head == NULL) return; for (h = 0; h < HTSIZE; h++) { while ((f = rtnl_dereference(head->ht[h])) != NULL) { RCU_INIT_POINTER(head->ht[h], rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) tcf_queue_work(&f->rwork, fw_delete_filter_work); else __fw_delete_filter(f); } } kfree_rcu(head, rcu); } static int fw_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = arg; struct fw_filter __rcu **fp; struct fw_filter *pfp; int ret = -EINVAL; int h; if (head == NULL || f == NULL) goto out; fp = &head->ht[fw_hash(f->id)]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { if (pfp == f) { RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, fw_delete_filter_work); ret = 0; break; } } *last = true; for (h = 0; h < HTSIZE; h++) { if (rcu_access_pointer(head->ht[h])) { *last = false; break; } } out: return ret; } static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { [TCA_FW_CLASSID] = { .type = NLA_U32 }, [TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_FW_MASK] = { .type = NLA_U32 }, }; static int fw_set_parms(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, struct nlattr **tca, unsigned long base, u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); u32 mask; int err; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, flags, extack); if (err < 0) return err; if (tb[TCA_FW_INDEV]) { int ret; ret = tcf_change_indev(net, tb[TCA_FW_INDEV], extack); if (ret < 0) return ret; f->ifindex = ret; } err = -EINVAL; if (tb[TCA_FW_MASK]) { mask = nla_get_u32(tb[TCA_FW_MASK]); if (mask != head->mask) return err; } else if (head->mask != 0xFFFFFFFF) return err; if (tb[TCA_FW_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); tcf_bind_filter(tp, &f->res, base); } return 0; } static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; int err; if (!opt) return handle ? -EINVAL : 0; /* Succeed if it is old method. */ err = nla_parse_nested_deprecated(tb, TCA_FW_MAX, opt, fw_policy, NULL); if (err < 0) return err; if (f) { struct fw_filter *pfp, *fnew; struct fw_filter __rcu **fp; if (f->id != handle && handle) return -EINVAL; fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); if (!fnew) return -ENOBUFS; fnew->id = f->id; fnew->ifindex = f->ifindex; fnew->tp = f->tp; err = tcf_exts_init(&fnew->exts, net, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) { kfree(fnew); return err; } err = fw_set_parms(net, tp, fnew, tb, tca, base, flags, extack); if (err < 0) { tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } fp = &head->ht[fw_hash(fnew->id)]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) if (pfp == f) break; RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next)); rcu_assign_pointer(*fp, fnew); tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, fw_delete_filter_work); *arg = fnew; return err; } if (!handle) return -EINVAL; if (!head) { u32 mask = 0xFFFFFFFF; if (tb[TCA_FW_MASK]) mask = nla_get_u32(tb[TCA_FW_MASK]); head = kzalloc(sizeof(*head), GFP_KERNEL); if (!head) return -ENOBUFS; head->mask = mask; rcu_assign_pointer(tp->root, head); } f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); if (f == NULL) return -ENOBUFS; err = tcf_exts_init(&f->exts, net, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) goto errout; f->id = handle; f->tp = tp; err = fw_set_parms(net, tp, f, tb, tca, base, flags, extack); if (err < 0) goto errout; RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]); rcu_assign_pointer(head->ht[fw_hash(handle)], f); *arg = f; return 0; errout: tcf_exts_destroy(&f->exts); kfree(f); return err; } static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct fw_head *head = rtnl_dereference(tp->root); int h; if (head == NULL) arg->stop = 1; if (arg->stop) return; for (h = 0; h < HTSIZE; h++) { struct fw_filter *f; for (f = rtnl_dereference(head->ht[h]); f; f = rtnl_dereference(f->next)) { if (arg->count < arg->skip) { arg->count++; continue; } if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; return; } arg->count++; } } } static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = fh; struct nlattr *nest; if (f == NULL) return skb->len; t->tcm_handle = f->id; if (!f->res.classid && !tcf_exts_has_actions(&f->exts)) return skb->len; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (f->res.classid && nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid)) goto nla_put_failure; if (f->ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, f->ifindex); if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name)) goto nla_put_failure; } if (head->mask != 0xFFFFFFFF && nla_put_u32(skb, TCA_FW_MASK, head->mask)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct fw_filter *f = fh; if (f && f->res.classid == classid) { if (cl) __tcf_bind_filter(q, &f->res, base); else __tcf_unbind_filter(q, &f->res); } } static struct tcf_proto_ops cls_fw_ops __read_mostly = { .kind = "fw", .classify = fw_classify, .init = fw_init, .destroy = fw_destroy, .get = fw_get, .change = fw_change, .delete = fw_delete, .walk = fw_walk, .dump = fw_dump, .bind_class = fw_bind_class, .owner = THIS_MODULE, }; static int __init init_fw(void) { return register_tcf_proto_ops(&cls_fw_ops); } static void __exit exit_fw(void) { unregister_tcf_proto_ops(&cls_fw_ops); } module_init(init_fw) module_exit(exit_fw) MODULE_LICENSE("GPL");
3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 // SPDX-License-Identifier: GPL-2.0-or-later /* Peer event handling, typically ICMP messages. * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/errqueue.h> #include <linux/udp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/icmp.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include <net/icmp.h> #include "ar-internal.h" static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int); static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); static void rxrpc_distribute_error(struct rxrpc_peer *, int, enum rxrpc_call_completion); /* * Find the peer associated with an ICMPv4 packet. */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, struct sk_buff *skb, unsigned int udp_offset, unsigned int *info, struct sockaddr_rxrpc *srx) { struct iphdr *ip, *ip0 = ip_hdr(skb); struct icmphdr *icmp = icmp_hdr(skb); struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); switch (icmp->type) { case ICMP_DEST_UNREACH: *info = ntohs(icmp->un.frag.mtu); fallthrough; case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: ip = (struct iphdr *)((void *)icmp + 8); break; default: return NULL; } memset(srx, 0, sizeof(*srx)); srx->transport_type = local->srx.transport_type; srx->transport_len = local->srx.transport_len; srx->transport.family = local->srx.transport.family; /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice * versa? */ switch (srx->transport.family) { case AF_INET: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; srx->transport.sin.sin_port = udp->dest; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; srx->transport.sin.sin_port = udp->dest; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; #endif default: WARN_ON_ONCE(1); return NULL; } _net("ICMP {%pISp}", &srx->transport); return rxrpc_lookup_peer_rcu(local, srx); } #ifdef CONFIG_AF_RXRPC_IPV6 /* * Find the peer associated with an ICMPv6 packet. */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, struct sk_buff *skb, unsigned int udp_offset, unsigned int *info, struct sockaddr_rxrpc *srx) { struct icmp6hdr *icmp = icmp6_hdr(skb); struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); switch (icmp->icmp6_type) { case ICMPV6_DEST_UNREACH: *info = ntohl(icmp->icmp6_mtu); fallthrough; case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: ip = (struct ipv6hdr *)((void *)icmp + 8); break; default: return NULL; } memset(srx, 0, sizeof(*srx)); srx->transport_type = local->srx.transport_type; srx->transport_len = local->srx.transport_len; srx->transport.family = local->srx.transport.family; /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice * versa? */ switch (srx->transport.family) { case AF_INET: _net("Rx ICMP6 on v4 sock"); srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; srx->transport.sin.sin_port = udp->dest; memcpy(&srx->transport.sin.sin_addr, &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); break; case AF_INET6: _net("Rx ICMP6"); srx->transport.sin.sin_port = udp->dest; memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, sizeof(struct in6_addr)); break; default: WARN_ON_ONCE(1); return NULL; } _net("ICMP {%pISp}", &srx->transport); return rxrpc_lookup_peer_rcu(local, srx); } #endif /* CONFIG_AF_RXRPC_IPV6 */ /* * Handle an error received on the local endpoint as a tunnel. */ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset) { struct sock_extended_err ee; struct sockaddr_rxrpc srx; struct rxrpc_local *local; struct rxrpc_peer *peer; unsigned int info = 0; int err; u8 version = ip_hdr(skb)->version; u8 type = icmp_hdr(skb)->type; u8 code = icmp_hdr(skb)->code; rcu_read_lock(); local = rcu_dereference_sk_user_data(sk); if (unlikely(!local)) { rcu_read_unlock(); return; } rxrpc_new_skb(skb, rxrpc_skb_received); switch (ip_hdr(skb)->version) { case IPVERSION: peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, &info, &srx); break; #ifdef CONFIG_AF_RXRPC_IPV6 case 6: peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, &info, &srx); break; #endif default: rcu_read_unlock(); return; } if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; if (!peer) { rcu_read_unlock(); return; } memset(&ee, 0, sizeof(ee)); switch (version) { case IPVERSION: switch (type) { case ICMP_DEST_UNREACH: switch (code) { case ICMP_FRAG_NEEDED: rxrpc_adjust_mtu(peer, info); rcu_read_unlock(); rxrpc_put_peer(peer); return; default: break; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { /* Might want to do something different with * non-fatal errors */ //harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; default: err = EPROTO; break; } ee.ee_origin = SO_EE_ORIGIN_ICMP; ee.ee_type = type; ee.ee_code = code; ee.ee_errno = err; break; #ifdef CONFIG_AF_RXRPC_IPV6 case 6: switch (type) { case ICMPV6_PKT_TOOBIG: rxrpc_adjust_mtu(peer, info); rcu_read_unlock(); rxrpc_put_peer(peer); return; } icmpv6_err_convert(type, code, &err); if (err == EACCES) err = EHOSTUNREACH; ee.ee_origin = SO_EE_ORIGIN_ICMP6; ee.ee_type = type; ee.ee_code = code; ee.ee_errno = err; break; #endif } trace_rxrpc_rx_icmp(peer, &ee, &srx); rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR); rcu_read_unlock(); rxrpc_put_peer(peer); } /* * Find the peer associated with a local error. */ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, const struct sk_buff *skb, struct sockaddr_rxrpc *srx) { struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); _enter(""); memset(srx, 0, sizeof(*srx)); srx->transport_type = local->srx.transport_type; srx->transport_len = local->srx.transport_len; srx->transport.family = local->srx.transport.family; switch (srx->transport.family) { case AF_INET: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; srx->transport.sin.sin_port = serr->port; switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP: _net("Rx ICMP"); memcpy(&srx->transport.sin.sin_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in_addr)); break; case SO_EE_ORIGIN_ICMP6: _net("Rx ICMP6 on v4 sock"); memcpy(&srx->transport.sin.sin_addr, skb_network_header(skb) + serr->addr_offset + 12, sizeof(struct in_addr)); break; default: memcpy(&srx->transport.sin.sin_addr, &ip_hdr(skb)->saddr, sizeof(struct in_addr)); break; } break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP6: _net("Rx ICMP6"); srx->transport.sin6.sin6_port = serr->port; memcpy(&srx->transport.sin6.sin6_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in6_addr)); break; case SO_EE_ORIGIN_ICMP: _net("Rx ICMP on v6 sock"); srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; srx->transport.sin.sin_port = serr->port; memcpy(&srx->transport.sin.sin_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in_addr)); break; default: memcpy(&srx->transport.sin6.sin6_addr, &ipv6_hdr(skb)->saddr, sizeof(struct in6_addr)); break; } break; #endif default: BUG(); } return rxrpc_lookup_peer_rcu(local, srx); } /* * Handle an MTU/fragmentation problem. */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { _net("Rx ICMP Fragmentation Needed (%d)", mtu); /* wind down the local interface MTU */ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { peer->if_mtu = mtu; _net("I/F MTU %u", mtu); } if (mtu == 0) { /* they didn't give us a size, estimate one */ mtu = peer->if_mtu; if (mtu > 1500) { mtu >>= 1; if (mtu < 1500) mtu = 1500; } else { mtu -= 100; if (mtu < peer->hdrsize) mtu = peer->hdrsize + 4; } } if (mtu < peer->mtu) { spin_lock_bh(&peer->lock); peer->mtu = mtu; peer->maxdata = peer->mtu - peer->hdrsize; spin_unlock_bh(&peer->lock); _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); } } /* * Handle an error received on the local endpoint. */ void rxrpc_error_report(struct sock *sk) { struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; struct rxrpc_local *local; struct rxrpc_peer *peer = NULL; struct sk_buff *skb; rcu_read_lock(); local = rcu_dereference_sk_user_data(sk); if (unlikely(!local)) { rcu_read_unlock(); return; } _enter("%p{%d}", sk, local->debug_id); /* Clear the outstanding error value on the socket so that it doesn't * cause kernel_sendmsg() to return it later. */ sock_error(sk); skb = sock_dequeue_err_skb(sk); if (!skb) { rcu_read_unlock(); _leave("UDP socket errqueue empty"); return; } rxrpc_new_skb(skb, rxrpc_skb_received); serr = SKB_EXT_ERR(skb); if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) { peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; if (peer) { trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); rxrpc_store_error(peer, serr); } } rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); _leave(""); } /* * Map an error report to error codes on the peer record. */ static void rxrpc_store_error(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) { enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR; struct sock_extended_err *ee; int err; _enter(""); ee = &serr->ee; err = ee->ee_errno; switch (ee->ee_origin) { case SO_EE_ORIGIN_ICMP: switch (ee->ee_type) { case ICMP_DEST_UNREACH: switch (ee->ee_code) { case ICMP_NET_UNREACH: _net("Rx Received ICMP Network Unreachable"); break; case ICMP_HOST_UNREACH: _net("Rx Received ICMP Host Unreachable"); break; case ICMP_PORT_UNREACH: _net("Rx Received ICMP Port Unreachable"); break; case ICMP_NET_UNKNOWN: _net("Rx Received ICMP Unknown Network"); break; case ICMP_HOST_UNKNOWN: _net("Rx Received ICMP Unknown Host"); break; default: _net("Rx Received ICMP DestUnreach code=%u", ee->ee_code); break; } break; case ICMP_TIME_EXCEEDED: _net("Rx Received ICMP TTL Exceeded"); break; default: _proto("Rx Received ICMP error { type=%u code=%u }", ee->ee_type, ee->ee_code); break; } break; case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: _proto("Rx Received local error { error=%d }", err); compl = RXRPC_CALL_LOCAL_ERROR; break; case SO_EE_ORIGIN_ICMP6: if (err == EACCES) err = EHOSTUNREACH; fallthrough; default: _proto("Rx Received error report { orig=%u }", ee->ee_origin); break; } rxrpc_distribute_error(peer, err, compl); } /* * Distribute an error that occurred on a peer. */ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, enum rxrpc_call_completion compl) { struct rxrpc_call *call; hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call); rxrpc_set_call_completion(call, compl, 0, -error); } } /* * Perform keep-alive pings. */ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, struct list_head *collector, time64_t base, u8 cursor) { struct rxrpc_peer *peer; const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1; time64_t keepalive_at; int slot; spin_lock_bh(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = list_entry(collector->next, struct rxrpc_peer, keepalive_link); list_del_init(&peer->keepalive_link); if (!rxrpc_get_peer_maybe(peer)) continue; if (__rxrpc_use_local(peer->local)) { spin_unlock_bh(&rxnet->peer_hash_lock); keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; slot = keepalive_at - base; _debug("%02x peer %u t=%d {%pISp}", cursor, peer->debug_id, slot, &peer->srx.transport); if (keepalive_at <= base || keepalive_at > base + RXRPC_KEEPALIVE_TIME) { rxrpc_send_keepalive(peer); slot = RXRPC_KEEPALIVE_TIME; } /* A transmission to this peer occurred since last we * examined it so put it into the appropriate future * bucket. */ slot += cursor; slot &= mask; spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); rxrpc_unuse_local(peer->local); } rxrpc_put_peer_locked(peer); } spin_unlock_bh(&rxnet->peer_hash_lock); } /* * Perform keep-alive pings with VERSION packets to keep any NAT alive. */ void rxrpc_peer_keepalive_worker(struct work_struct *work) { struct rxrpc_net *rxnet = container_of(work, struct rxrpc_net, peer_keepalive_work); const u8 mask = ARRAY_SIZE(rxnet->peer_keepalive) - 1; time64_t base, now, delay; u8 cursor, stop; LIST_HEAD(collector); now = ktime_get_seconds(); base = rxnet->peer_keepalive_base; cursor = rxnet->peer_keepalive_cursor; _enter("%lld,%u", base - now, cursor); if (!rxnet->live) return; /* Remove to a temporary list all the peers that are currently lodged * in expired buckets plus all new peers. * * Everything in the bucket at the cursor is processed this * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ spin_lock_bh(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); while (base <= now && (s8)(cursor - stop) < 0) { list_splice_tail_init(&rxnet->peer_keepalive[cursor & mask], &collector); base++; cursor++; } base = now; spin_unlock_bh(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; rxrpc_peer_keepalive_dispatch(rxnet, &collector, base, cursor); ASSERT(list_empty(&collector)); /* Schedule the timer for the next occupied timeslot. */ cursor = rxnet->peer_keepalive_cursor; stop = cursor + RXRPC_KEEPALIVE_TIME - 1; for (; (s8)(cursor - stop) < 0; cursor++) { if (!list_empty(&rxnet->peer_keepalive[cursor & mask])) break; base++; } now = ktime_get_seconds(); delay = base - now; if (delay < 1) delay = 1; delay *= HZ; if (rxnet->live) timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); _leave(""); }
4982 23 40 4257 113 2 1194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_BIT_H #define _LINUX_WAIT_BIT_H /* * Linux wait-bit related types and methods: */ #include <linux/wait.h> struct wait_bit_key { void *flags; int bit_nr; unsigned long timeout; }; struct wait_bit_queue_entry { struct wait_bit_key key; struct wait_queue_entry wq_entry; }; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); void wake_up_bit(void *word, int bit); int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); struct wait_queue_head *bit_waitqueue(void *word, int bit); extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue_entry name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ .entry = \ LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } extern int bit_wait(struct wait_bit_key *key, int mode); extern int bit_wait_io(struct wait_bit_key *key, int mode); extern int bit_wait_timeout(struct wait_bit_key *key, int mode); extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode); /** * wait_on_bit - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * There is a standard hashed waitqueue table for generic use. This * is the part of the hashtable's accessor API that waits on a bit. * For instance, if one were to have waiters on a bitflag, one would * call wait_on_bit() in threads waiting for the bit to clear. * One uses wait_on_bit() where one is waiting for the bit to clear, * but has no intention of setting it. * Returned value will be zero if the bit was cleared, or non-zero * if the process received a signal and the mode permitted wakeup * on that signal. */ static inline int wait_on_bit(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait, mode); } /** * wait_on_bit_io - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared. This is similar to wait_on_bit(), but calls * io_schedule() instead of schedule() for the actual waiting. * * Returned value will be zero if the bit was cleared, or non-zero * if the process received a signal and the mode permitted wakeup * on that signal. */ static inline int wait_on_bit_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, bit_wait_io, mode); } /** * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * @timeout: timeout, in jiffies * * Use the standard hashed waitqueue table to wait for a bit * to be cleared. This is similar to wait_on_bit(), except also takes a * timeout parameter. * * Returned value will be zero if the bit was cleared before the * @timeout elapsed, or non-zero if the @timeout elapsed or process * received a signal and the mode permitted wakeup on that signal. */ static inline int wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, unsigned long timeout) { might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit_timeout(word, bit, bit_wait_timeout, mode, timeout); } /** * wait_on_bit_action - wait for a bit to be cleared * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared, and allow the waiting action to be specified. * This is like wait_on_bit() but allows fine control of how the waiting * is done. * * Returned value will be zero if the bit was cleared, or non-zero * if the process received a signal and the mode permitted wakeup * on that signal. */ static inline int wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_bit(bit, word)) return 0; return out_of_line_wait_on_bit(word, bit, action, mode); } /** * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * There is a standard hashed waitqueue table for generic use. This * is the part of the hashtable's accessor API that waits on a bit * when one intends to set it, for instance, trying to lock bitflags. * For instance, if one were to have waiters trying to set bitflag * and waiting for it to clear before setting it, one would call * wait_on_bit() in threads waiting to be able to set the bit. * One uses wait_on_bit_lock() where one is waiting for the bit to * clear with the intention of setting it, and when done, clearing it. * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); } /** * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared and then to atomically set it. This is similar * to wait_on_bit(), but calls io_schedule() instead of schedule() * for the actual waiting. * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); } /** * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it * @word: the word being waited on, a kernel virtual address * @bit: the bit of the word being waited on * @action: the function used to sleep, which may take special actions * @mode: the task state to sleep in * * Use the standard hashed waitqueue table to wait for a bit * to be cleared and then to set it, and allow the waiting action * to be specified. * This is like wait_on_bit() but allows fine control of how the waiting * is done. * * Returns zero if the bit was (eventually) found to be clear and was * set. Returns non-zero if a signal was delivered to the process and * the @mode allows that signal to wake the process. */ static inline int wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, unsigned mode) { might_sleep(); if (!test_and_set_bit(bit, word)) return 0; return out_of_line_wait_on_bit_lock(word, bit, action, mode); } extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); extern void wake_up_var(void *var); extern wait_queue_head_t *__var_waitqueue(void *p); #define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_head *__wq_head = __var_waitqueue(var); \ struct wait_bit_queue_entry __wbq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_var_entry(&__wbq_entry, var, \ exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(__wq_head, \ &__wbq_entry.wq_entry, \ state); \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(__wq_head, &__wbq_entry.wq_entry); \ __out: __ret; \ }) #define __wait_var_event(var, condition) \ ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) #define wait_var_event(var, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_var_event(var, condition); \ } while (0) #define __wait_var_event_killable(var, condition) \ ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ schedule()) #define wait_var_event_killable(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_killable(var, condition); \ __ret; \ }) #define __wait_var_event_timeout(var, condition, timeout) \ ___wait_var_event(var, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) #define wait_var_event_timeout(var, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_var_event_timeout(var, condition, timeout); \ __ret; \ }) #define __wait_var_event_interruptible(var, condition) \ ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) #define wait_var_event_interruptible(var, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_var_event_interruptible(var, condition); \ __ret; \ }) /** * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit * * @bit: the bit of the word being waited on * @word: the word being waited on, a kernel virtual address * * You can use this helper if bitflags are manipulated atomically rather than * non-atomically under a lock. */ static inline void clear_and_wake_up_bit(int bit, void *word) { clear_bit_unlock(bit, word); /* See wake_up_bit() for which memory barrier you need to use. */ smp_mb__after_atomic(); wake_up_bit(word, bit); } #endif /* _LINUX_WAIT_BIT_H */
4 4 4 4 4 4 3 4 4 30 30 10 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0-only /* * scsi_pm.c Copyright (C) 2010 Alan Stern * * SCSI dynamic Power Management * Initial version: Alan Stern <stern@rowland.harvard.edu> */ #include <linux/pm_runtime.h> #include <linux/export.h> #include <linux/async.h> #include <linux/blk-pm.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_host.h> #include "scsi_priv.h" #ifdef CONFIG_PM_SLEEP static int do_scsi_suspend(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->suspend ? pm->suspend(dev) : 0; } static int do_scsi_freeze(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->freeze ? pm->freeze(dev) : 0; } static int do_scsi_poweroff(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->poweroff ? pm->poweroff(dev) : 0; } static int do_scsi_resume(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->resume ? pm->resume(dev) : 0; } static int do_scsi_thaw(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->thaw ? pm->thaw(dev) : 0; } static int do_scsi_restore(struct device *dev, const struct dev_pm_ops *pm) { return pm && pm->restore ? pm->restore(dev) : 0; } static int scsi_dev_type_suspend(struct device *dev, int (*cb)(struct device *, const struct dev_pm_ops *)) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int err; /* flush pending in-flight resume operations, suspend is synchronous */ async_synchronize_full_domain(&scsi_sd_pm_domain); err = scsi_device_quiesce(to_scsi_device(dev)); if (err == 0) { err = cb(dev, pm); if (err) scsi_device_resume(to_scsi_device(dev)); } dev_dbg(dev, "scsi suspend: %d\n", err); return err; } static int scsi_dev_type_resume(struct device *dev, int (*cb)(struct device *, const struct dev_pm_ops *)) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int err = 0; err = cb(dev, pm); scsi_device_resume(to_scsi_device(dev)); dev_dbg(dev, "scsi resume: %d\n", err); if (err == 0) { pm_runtime_disable(dev); err = pm_runtime_set_active(dev); pm_runtime_enable(dev); /* * Forcibly set runtime PM status of request queue to "active" * to make sure we can again get requests from the queue * (see also blk_pm_peek_request()). * * The resume hook will correct runtime PM status of the disk. */ if (!err && scsi_is_sdev_device(dev)) { struct scsi_device *sdev = to_scsi_device(dev); blk_set_runtime_active(sdev->request_queue); } } return err; } static int scsi_bus_suspend_common(struct device *dev, int (*cb)(struct device *, const struct dev_pm_ops *)) { int err = 0; if (scsi_is_sdev_device(dev)) { /* * All the high-level SCSI drivers that implement runtime * PM treat runtime suspend, system suspend, and system * hibernate nearly identically. In all cases the requirements * for runtime suspension are stricter. */ if (pm_runtime_suspended(dev)) return 0; err = scsi_dev_type_suspend(dev, cb); } return err; } static void async_sdev_resume(void *dev, async_cookie_t cookie) { scsi_dev_type_resume(dev, do_scsi_resume); } static void async_sdev_thaw(void *dev, async_cookie_t cookie) { scsi_dev_type_resume(dev, do_scsi_thaw); } static void async_sdev_restore(void *dev, async_cookie_t cookie) { scsi_dev_type_resume(dev, do_scsi_restore); } static int scsi_bus_resume_common(struct device *dev, int (*cb)(struct device *, const struct dev_pm_ops *)) { async_func_t fn; if (!scsi_is_sdev_device(dev)) fn = NULL; else if (cb == do_scsi_resume) fn = async_sdev_resume; else if (cb == do_scsi_thaw) fn = async_sdev_thaw; else if (cb == do_scsi_restore) fn = async_sdev_restore; else fn = NULL; if (fn) { async_schedule_domain(fn, dev, &scsi_sd_pm_domain); /* * If a user has disabled async probing a likely reason * is due to a storage enclosure that does not inject * staggered spin-ups. For safety, make resume * synchronous as well in that case. */ if (strncmp(scsi_scan_type, "async", 5) != 0) async_synchronize_full_domain(&scsi_sd_pm_domain); } else { pm_runtime_disable(dev); pm_runtime_set_active(dev); pm_runtime_enable(dev); } return 0; } static int scsi_bus_prepare(struct device *dev) { if (scsi_is_host_device(dev)) { /* Wait until async scanning is finished */ scsi_complete_async_scans(); } return 0; } static int scsi_bus_suspend(struct device *dev) { return scsi_bus_suspend_common(dev, do_scsi_suspend); } static int scsi_bus_resume(struct device *dev) { return scsi_bus_resume_common(dev, do_scsi_resume); } static int scsi_bus_freeze(struct device *dev) { return scsi_bus_suspend_common(dev, do_scsi_freeze); } static int scsi_bus_thaw(struct device *dev) { return scsi_bus_resume_common(dev, do_scsi_thaw); } static int scsi_bus_poweroff(struct device *dev) { return scsi_bus_suspend_common(dev, do_scsi_poweroff); } static int scsi_bus_restore(struct device *dev) { return scsi_bus_resume_common(dev, do_scsi_restore); } #else /* CONFIG_PM_SLEEP */ #define scsi_bus_prepare NULL #define scsi_bus_suspend NULL #define scsi_bus_resume NULL #define scsi_bus_freeze NULL #define scsi_bus_thaw NULL #define scsi_bus_poweroff NULL #define scsi_bus_restore NULL #endif /* CONFIG_PM_SLEEP */ static int sdev_runtime_suspend(struct device *dev) { const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; struct scsi_device *sdev = to_scsi_device(dev); int err = 0; err = blk_pre_runtime_suspend(sdev->request_queue); if (err) return err; if (pm && pm->runtime_suspend) err = pm->runtime_suspend(dev); blk_post_runtime_suspend(sdev->request_queue, err); return err; } static int scsi_runtime_suspend(struct device *dev) { int err = 0; dev_dbg(dev, "scsi_runtime_suspend\n"); if (scsi_is_sdev_device(dev)) err = sdev_runtime_suspend(dev); /* Insert hooks here for targets, hosts, and transport classes */ return err; } static int sdev_runtime_resume(struct device *dev) { struct scsi_device *sdev = to_scsi_device(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; int err = 0; blk_pre_runtime_resume(sdev->request_queue); if (pm && pm->runtime_resume) err = pm->runtime_resume(dev); blk_post_runtime_resume(sdev->request_queue); return err; } static int scsi_runtime_resume(struct device *dev) { int err = 0; dev_dbg(dev, "scsi_runtime_resume\n"); if (scsi_is_sdev_device(dev)) err = sdev_runtime_resume(dev); /* Insert hooks here for targets, hosts, and transport classes */ return err; } static int scsi_runtime_idle(struct device *dev) { dev_dbg(dev, "scsi_runtime_idle\n"); /* Insert hooks here for targets, hosts, and transport classes */ if (scsi_is_sdev_device(dev)) { pm_runtime_mark_last_busy(dev); pm_runtime_autosuspend(dev); return -EBUSY; } return 0; } int scsi_autopm_get_device(struct scsi_device *sdev) { int err; err = pm_runtime_get_sync(&sdev->sdev_gendev); if (err < 0 && err !=-EACCES) pm_runtime_put_sync(&sdev->sdev_gendev); else err = 0; return err; } EXPORT_SYMBOL_GPL(scsi_autopm_get_device); void scsi_autopm_put_device(struct scsi_device *sdev) { pm_runtime_put_sync(&sdev->sdev_gendev); } EXPORT_SYMBOL_GPL(scsi_autopm_put_device); void scsi_autopm_get_target(struct scsi_target *starget) { pm_runtime_get_sync(&starget->dev); } void scsi_autopm_put_target(struct scsi_target *starget) { pm_runtime_put_sync(&starget->dev); } int scsi_autopm_get_host(struct Scsi_Host *shost) { int err; err = pm_runtime_get_sync(&shost->shost_gendev); if (err < 0 && err !=-EACCES) pm_runtime_put_sync(&shost->shost_gendev); else err = 0; return err; } void scsi_autopm_put_host(struct Scsi_Host *shost) { pm_runtime_put_sync(&shost->shost_gendev); } const struct dev_pm_ops scsi_bus_pm_ops = { .prepare = scsi_bus_prepare, .suspend = scsi_bus_suspend, .resume = scsi_bus_resume, .freeze = scsi_bus_freeze, .thaw = scsi_bus_thaw, .poweroff = scsi_bus_poweroff, .restore = scsi_bus_restore, .runtime_suspend = scsi_runtime_suspend, .runtime_resume = scsi_runtime_resume, .runtime_idle = scsi_runtime_idle, };
64 12 62 48 47 67 52 46 18 59 11 5 6 11 11 11 21 3 6 12 1 11 3 1 16 18 21 7 14 6 11 21 21 21 10 11 10 12 12 12 20 20 20 20 20 6 6 3 3 3 6 6 6 1 5 3 6 1 1 5 5 5 2 1 1 4 1 5 3 2 4 2 4 37 6 41 42 6 16 37 18 20 23 14 14 2 16 8 11 49 1 47 30 30 1 29 30 82 82 81 50 50 1 17 49 80 24 79 79 79 30 15 18 2 12 21 10 30 78 4 19 1 18 19 17 19 8 2 3 1 9 19 45 43 7 50 50 44 20 45 14 15 52 53 7 7 20 9 9 19 19 19 11 11 127 126 1 127 127 127 127 1 160 157 35 12 32 32 32 153 32 12 161 1 1 159 158 2 13 154 155 159 141 42 159 12 152 44 139 1 1 36 2 2 107 107 2 1 106 4 4 20 20 20 13 12 13 20 20 20 20 5 5 7 7 1 1 1 1 1 38 38 38 21 21 28 11 38 20 28 18 16 3 17 17 2 7 17 12 7 1 1 1 2 15 5 10 6 18 1 22 9 21 2 2 17 5 5 8 20 21 21 34 1 6 27 28 28 7 7 7 28 2 28 28 28 33 22 27 4 22 22 1 23 2 5 6 3 10 3 7 10 10 20 1 2 4 13 3 14 3 49 49 11 11 11 50 50 50 49 32 7 38 38 11 81 45 50 2 2 172 171 144 76 76 37 13 161 161 160 14 14 162 14 161 142 82 1 81 50 2 2 2 44 37 9 44 81 132 132 132 130 10 131 136 136 136 1 1 9 6 16 16 12 12 10 12 1 5 7 6 61 60 1 61 47 1 34 15 26 25 25 26 26 24 46 17 45 2 1 1 33 31 2 1 35 1 33 33 2 35 2 34 2 34 35 21 15 3 16 2 35 35 41 41 41 40 2 1 41 41 4 4 4 4 17 17 17 17 2 2 2 2 11 11 11 11 1 1 1 66 66 57 7 47 57 44 32 7 12 12 43 25 7 25 8 18 1 23 52 66 65 11 11 5 1 1 1 3 2 2 1 1 1 1 1 1 1 1 1 1 6 4 1 1 18 14 6 18 3 1 3 1 1 1 2 2 2 2 3 6 3 4 4 2 6 2 1 2 2 2 4 4 4 2 3 1 3 3 3 2 2 2 2 3 2 3 3 3 2 2 3 3 3 3 3 10 3 7 21 23 2 43 24 26 25 3 26 12 12 11 1 1 1 1 15 1 15 15 15 15 15 4 4 4 1 1 2 2 2 2 4 1 2 8 10 10 10 3 7 1 1 211 211 50 50 50 49 50 50 48 50 50 49 50 49 52 32 20 81 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 13 13 13 6 5 9 3 12 12 12 12 10 2 12 7 5 5 1 4 12 12 9 3 12 14 3 13 6 6 6 6 6 6 6 4 4 4 4 4 12 12 12 12 12 2 1 11 12 11 54 183 181 79 1 77 2 2 2 2 15 14 15 24 5 15 7 1 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> #include <asm/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "volumes.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" #include "zoned.h" #include "subpage.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; }; static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate); /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * * ilock_flags can have the following bit set: * * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock_shared(inode)) return -EAGAIN; else return 0; } inode_lock_shared(inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock(inode)) return -EAGAIN; else return 0; } inode_lock(inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) down_write(&BTRFS_I(inode)->i_mmap_lock); return 0; } /* * btrfs_inode_unlock - unock inode i_rwsem * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. */ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) up_write(&BTRFS_I(inode)->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) inode_unlock_shared(inode); else inode_unlock(inode); } /* * Cleanup all submitted ordered extents in specified range to handle errors * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *locked_page, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; u64 page_start = page_offset(locked_page); u64 page_end = page_start + PAGE_SIZE - 1; struct page *page; while (index <= end_index) { /* * For locked page, we will call end_extent_writepage() on it * in run_delalloc_range() for the error handling. That * end_extent_writepage() function will call * btrfs_mark_ordered_io_finished() to clear page Ordered and * run the ordered extent accounting. * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { index++; continue; } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; /* * Here we just clear all Ordered bits for every page in the * range, then __endio_write_update_ordered() will handle * the ordered extent accounting for the range. */ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, offset, bytes); put_page(page); } /* The locked page covers the full range, nothing needs to be done */ if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) return; /* * In case this page belongs to the delalloc range being instantiated * then skip it, since the first page of a range is going to be * properly cleaned up by the caller of run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; offset = page_offset(locked_page) + PAGE_SIZE; } return __endio_write_update_ordered(inode, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) { int err; err = btrfs_init_acl(trans, inode, dir); if (!err) err = btrfs_xattr_security_init(trans, inode, dir, qstr); return err; } /* * this does all the hard work for inserting an inline extent into * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, bool extent_inserted, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, int compress_type, struct page **compressed_pages) { struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; unsigned long offset; ASSERT((compressed_size > 0 && compressed_pages) || (compressed_size == 0 && !compressed_pages)); if (compressed_size && compressed_pages) cur_size = compressed_size; if (!extent_inserted) { struct btrfs_key key; size_t datasize; key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = start; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) goto fail; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_encryption(leaf, ei, 0); btrfs_set_file_extent_other_encoding(leaf, ei, 0); btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { cpage = compressed_pages[i]; cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); kunmap_atomic(kaddr); i++; ptr += cur_size; compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { page = find_get_page(inode->i_mapping, start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); offset = offset_in_page(start); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); /* * We align size to sectorsize for inline extents just for simplicity * sake. */ size = ALIGN(size, root->fs_info->sectorsize); ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size); if (ret) goto fail; /* * we're an inline extent, so nobody can * extend the file past i_size without locking * a page we already have locked. * * We must do any isize and inode updates * before we unlock the pages. Otherwise we * could end up racing with unlink. */ BTRFS_I(inode)->disk_i_size = inode->i_size; fail: return ret; } /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. */ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, u64 end, size_t compressed_size, int compress_type, struct page **compressed_pages) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 isize = i_size_read(&inode->vfs_inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; u64 aligned_end = ALIGN(end, fs_info->sectorsize); u64 data_len = inline_len; int ret; struct btrfs_path *path; if (compressed_size) data_len = compressed_size; if (start > 0 || actual_end > fs_info->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || (!compressed_size && (actual_end & (fs_info->sectorsize - 1)) == 0) || end + 1 < isize || data_len > fs_info->max_inline) { return 1; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } trans->block_rsv = &inode->block_rsv; drop_args.path = path; drop_args.start = start; drop_args.end = aligned_end; drop_args.drop_cache = true; drop_args.replace_extent = true; if (compressed_size && compressed_pages) drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( compressed_size); else drop_args.extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, path, drop_args.extent_inserted, root, &inode->vfs_inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); out: /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. * And at reserve time, it's always aligned to page size, so * just free one page here. */ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; } struct async_extent { u64 start; u64 ram_size; u64 compressed_size; struct page **pages; unsigned long nr_pages; int compress_type; struct list_head list; }; struct async_chunk { struct inode *inode; struct page *locked_page; u64 start; u64 end; unsigned int write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; atomic_t *pending; }; struct async_cow { /* Number of chunks in flight; must be first in the structure */ atomic_t num_chunks; struct async_chunk chunks[]; }; static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, unsigned long nr_pages, int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); BUG_ON(!async_extent); /* -ENOMEM */ async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } /* * Check if the inode has flags compatible with compression */ static inline bool inode_can_compress(struct btrfs_inode *inode) { /* Subpage doesn't support compression yet */ if (inode->root->fs_info->sectorsize < PAGE_SIZE) return false; if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; return true; } /* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); return 0; } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* defrag ioctl */ if (inode->defrag_compress) return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) return btrfs_compress_heuristic(&inode->vfs_inode, start, end); return 0; } static inline void inode_should_defrag(struct btrfs_inode *inode, u64 start, u64 end, u64 num_bytes, u64 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); } /* * we create compressed extents in two phases. The first * phase compresses a range of pages that have already been * locked (both pages and state bits are locked). * * This is done inside an ordered work queue, and the compression * is spread across many cpus. The actual IO submission is step * two, and the ordered work queue takes care of making sure that * happens in the same order things were put onto the queue by * writepages and friends. * * If this code finds it can't get good compression, it puts an * entry onto the work queue to write the uncompressed bytes. This * makes sure that both compressed inodes and uncompressed inodes * are written in the same order that the flusher thread sent them * down. */ static noinline int compress_file_range(struct async_chunk *async_chunk) { struct inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; u64 i_size; int ret = 0; struct page **pages = NULL; unsigned long nr_pages; unsigned long total_compressed = 0; unsigned long total_in = 0; int i; int will_compress; int compress_type = fs_info->compress_type; int compressed_extents = 0; int redirty = 0; inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, SZ_16K); /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size * later on. * * The barriers are to emulate READ_ONCE, remove that once i_size_read * does that for us. */ barrier(); i_size = i_size_read(inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: will_compress = 0; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0); nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED / PAGE_SIZE); /* * we don't want to send crud past the end of i_size through * compression, that's just a waste of CPU time. So, if the * end of the file is before the start of our current * requested range of bytes, we bail out to the uncompressed * cleanup code that can deal with all of this. * * It isn't really the fastest way to fix things, but this is a * very uncommon corner. */ if (actual_end <= start) goto cleanup_and_bail_uncompressed; total_compressed = actual_end - start; /* * skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) goto cleanup_and_bail_uncompressed; total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; ret = 0; /* * we do compression for mount -o compress and when the * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ if (inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ nr_pages = 0; goto cont; } if (BTRFS_I(inode)->defrag_compress) compress_type = BTRFS_I(inode)->defrag_compress; else if (BTRFS_I(inode)->prop_compress) compress_type = BTRFS_I(inode)->prop_compress; /* * we need to call clear_page_dirty_for_io on each * page in the range. Otherwise applications with the file * mmap'd can wander in and change the page contents while * we are compressing them. * * If the compression fails for any reason, we set the pages * dirty again later on. * * Note that the remaining part is redirtied, the start pointer * has moved, the end is the original one. */ if (!redirty) { extent_range_clear_dirty_for_io(inode, start, end); redirty = 1; } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), inode->i_mapping, start, pages, &nr_pages, &total_in, &total_compressed); if (!ret) { unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; /* zero the tail end of the last page, we might be * sending it down to disk */ if (offset) memzero_page(page, offset, PAGE_SIZE - offset); will_compress = 1; } } cont: /* * Check cow_file_range() for why we don't even try to create inline * extent for subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ if (ret || total_in < actual_end) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ ret = cow_file_range_inline(BTRFS_I(inode), start, end, 0, BTRFS_COMPRESS_NONE, NULL); } else { /* try making a compressed inline extent */ ret = cow_file_range_inline(BTRFS_I(inode), start, end, total_compressed, compress_type, pages); } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; unsigned long page_error_op; page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. * * We use DO_ACCOUNTING here because we need the * delalloc_release_metadata to be done _after_ we drop * our outstanding extent for clearing delalloc for this * range. */ extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, NULL, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); /* * Ensure we only free the compressed pages if we have * them allocated, as we can still reach here with * inode_need_compress() == false. */ if (pages) { for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); } return 0; } } if (will_compress) { /* * we aren't doing an inline extent round the compressed size * up to a block size boundary so the allocator does sane * things */ total_compressed = ALIGN(total_compressed, blocksize); /* * one last check to make sure the compression is really a * win, compare the page count read with the blocks on disk, * compression must free at least one sector size */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { compressed_extents++; /* * The async work queues will take care of doing actual * allocation on disk for these compressed pages, and * will submit them to the elevator. */ add_async_extent(async_chunk, start, total_in, total_compressed, pages, nr_pages, compress_type); if (start + total_in < end) { start += total_in; pages = NULL; cond_resched(); goto again; } return compressed_extents; } } if (pages) { /* * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array */ for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); pages = NULL; total_compressed = 0; nr_pages = 0; /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !(BTRFS_I(inode)->prop_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } } cleanup_and_bail_uncompressed: /* * No compression, but we still need to write the pages in the file * we've been given so far. redirty the locked page if it corresponds * to our extent and set things up for the async work queue to run * cow_file_range to do the normal delalloc dance. */ if (async_chunk->locked_page && (page_offset(async_chunk->locked_page) >= start && page_offset(async_chunk->locked_page)) <= end) { __set_page_dirty_nobuffers(async_chunk->locked_page); /* unlocked later on in the async handlers */ } if (redirty) extent_range_redirty_for_io(inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); compressed_extents++; return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) { int i; if (!async_extent->pages) return; for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); put_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; async_extent->pages = NULL; } /* * phase two of compressed writeback. This is the ordered portion * of the code, which only gets called in the order the work was * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; struct btrfs_key ins; struct extent_map *em; struct btrfs_root *root = inode->root; struct extent_io_tree *io_tree = &inode->io_tree; int ret = 0; again: while (!list_empty(&async_chunk->extents)) { async_extent = list_entry(async_chunk->extents.next, struct async_extent, list); list_del(&async_extent->list); retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); /* did the compression code fall back to uncompressed IO? */ if (!async_extent->pages) { int page_started = 0; unsigned long nr_written = 0; /* allocate blocks */ ret = cow_file_range(inode, async_chunk->locked_page, async_extent->start, async_extent->start + async_extent->ram_size - 1, &page_started, &nr_written, 0); /* JDM XXX */ /* * if page_started, cow_file_range inserted an * inline extent and took care of all the unlocking * and IO for us. Otherwise, we need to submit * all those pages down to the drive. */ if (!page_started && !ret) extent_write_locked_range(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, WB_SYNC_ALL); else if (ret && async_chunk->locked_page) unlock_page(async_chunk->locked_page); kfree(async_extent); cond_resched(); continue; } ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); if (ret) { free_async_extent_pages(async_extent); if (ret == -ENOSPC) { unlock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); /* * we need to redirty the pages if we decide to * fallback to uncompressed IO, otherwise we * will not submit these pages down to lower * layers. */ extent_range_redirty_for_io(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1); goto retry; } goto out_free; } /* * here we're doing allocation and writeback of the * compressed pages */ em = create_io_em(inode, async_extent->start, async_extent->ram_size, /* len */ async_extent->start, /* orig_start */ ins.objectid, /* block_start */ ins.offset, /* block_len */ ins.offset, /* orig_block_len */ async_extent->ram_size, /* ram_bytes */ async_extent->compress_type, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) /* ret value is not necessary due to void function */ goto out_free_reserve; free_extent_map(em); ret = btrfs_add_ordered_extent_compress(inode, async_extent->start, ins.objectid, async_extent->ram_size, ins.offset, async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, 0); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages, async_chunk->write_flags, async_chunk->blkcg_css)) { struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->vfs_inode.i_mapping; btrfs_writepage_endio_finish_ordered(inode, p, start, end, false); p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); } alloc_hint = ins.objectid + ins.offset; kfree(async_extent); cond_resched(); } return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); goto again; } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; read_lock(&em_tree->lock); em = search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ if (em->block_start >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); em = search_extent_mapping(em_tree, 0, 0); if (em && em->block_start < EXTENT_MAP_LAST_BYTE) alloc_hint = em->block_start; if (em) free_extent_map(em); } else { alloc_hint = em->block_start; free_extent_map(em); } } read_unlock(&em_tree->lock); return alloc_hint; } /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * * *page_started is set to one if we unlock locked_page and do everything * required to start IO on it. It may be clean and already done with * IO when we return. * * When unlock == 1, we unlock the pages in successfully allocated regions. * When unlock == 0, we leave them locked for writing them out. * * However, we unlock all the pages except @locked_page in case of failure. * * In summary, page locking state will be as follow: * * - page_started == 1 (return value) * - All the pages are unlocked. IO is started. * - Note that this can happen only on success * - unlock == 1 * - All the pages except @locked_page are unlocked in any case * - unlock == 0 * - On success, all the pages are locked for writing out them * - On failure, all the pages except @locked_page are unlocked * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept * intact. So, the caller must clean them up by calling * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; unsigned clear_bits; unsigned long page_ops; bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; } num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); inode_should_defrag(inode, start, end, num_bytes, SZ_64K); /* * Due to the page size limit, for subpage we can only trigger the * writeback for the dirty sectors of page, that means data writeback * is doing more writeback than what we want. * * This is especially unexpected for some call sites like fallocate, * where we only increase i_size after everything is done. * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the * delalloc_release_metadata to be run _after_ we drop * our outstanding extent for clearing delalloc for this * range. */ extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; /* * locked_page is locked by the caller of * writepage_delalloc(), not locked by * __process_pages_contig(). * * We can't let __process_pages_contig() to unlock it, * as it doesn't have any subpage::writers recorded. * * Here we manually unlock the page, since the caller * can't use page_started to determine if it's an * inline extent or a compressed extent. */ unlock_page(locked_page); goto out; } else if (ret < 0) { goto out_unlock; } } alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the * extents. However, due to an operation such as scrub turning a block * group to RO mode, it may fallback to COW mode, so we must make sure * an extent allocated during COW has exactly the requested size and can * not be split into smaller extents, otherwise relocation breaks and * fails during the stage where it updates the bytenr of file extent * items. */ if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; extent_reserved = true; ram_size = ins.offset; em = create_io_em(inode, start, ins.offset, /* len */ start, /* orig_start */ ins.objectid, /* block_start */ ins.offset, /* block_len */ ins.offset, /* orig_block_len */ ram_size, /* ram_bytes */ BTRFS_COMPRESS_NONE, /* compress_type */ BTRFS_ORDERED_REGULAR /* type */); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_reserve; } free_extent_map(em); ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, BTRFS_ORDERED_REGULAR); if (ret) goto out_drop_extent_cache; if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); /* * Only drop cache here, and process as normal. * * We must not allow extent_clear_unlock_delalloc() * at out_unlock label to free meta of this ordered * extent, as its meta should be freed by * btrfs_finish_ordered_io(). * * So we must continue until @start is increased to * skip current ordered extent. */ if (ret) btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* * We're not doing compressed IO, don't unlock the first page * (which the caller expects to stay locked), don't clear any * dirty bits and don't set any writeback bits * * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ page_ops = unlock ? PAGE_UNLOCK : 0; page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; extent_reserved = false; /* * btrfs_reloc_clone_csums() error, since start is increased * extent_clear_unlock_delalloc() at out_unlock label won't * free metadata of current ordered extent, we're OK to exit. */ if (ret) goto out_unlock; } out: return ret; out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| * `- orig_start `- start `- start + cur_alloc_size `- end * * We process each region below. */ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * * However, in case of unlock == 0, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ if (!unlock && orig_start < start) extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_page, 0, page_ops); /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_page, clear_bits, page_ops); start += cur_alloc_size; if (start >= end) goto out; } /* * For the range (3). We never touched the region. In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); goto out; } /* * work queue call back to started compression on a file and pages */ static noinline void async_cow_start(struct btrfs_work *work) { struct async_chunk *async_chunk; int compressed_extents; async_chunk = container_of(work, struct async_chunk, work); compressed_extents = compress_file_range(async_chunk); if (compressed_extents == 0) { btrfs_add_delayed_iput(async_chunk->inode); async_chunk->inode = NULL; } } /* * work queue call back to submit previously compressed pages */ static noinline void async_cow_submit(struct btrfs_work *work) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); unsigned long nr_pages; nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; /* * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to * always adjust ->async_delalloc_pages as its paired with the init * happening in cow_file_range_async */ if (async_chunk->inode) submit_compressed_extents(async_chunk); /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 5 * SZ_1M) cond_wake_up_nomb(&fs_info->async_submit_wait); } static noinline void async_cow_free(struct btrfs_work *work) { struct async_chunk *async_chunk; async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); /* * Since the pointer to 'pending' is at the beginning of the array of * async_chunk's, freeing it ensures the whole array has been freed. */ if (atomic_dec_and_test(async_chunk->pending)) kvfree(async_chunk->pending); } static int cow_file_range_async(struct btrfs_inode *inode, struct writeback_control *wbc, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; u64 cur_end; u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); int i; bool should_compress; unsigned nofs_flag; const unsigned int write_flags = wbc_to_write_flags(wbc); unlock_extent(&inode->io_tree, start, end); if (inode->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { num_chunks = 1; should_compress = false; } else { should_compress = true; } nofs_flag = memalloc_nofs_save(); ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!ctx) { unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR; extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); return -ENOMEM; } async_chunk = ctx->chunks; atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { if (should_compress) cur_end = min(end, start + SZ_512K - 1); else cur_end = end; /* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); async_chunk[i].pending = &ctx->num_chunks; async_chunk[i].inode = &inode->vfs_inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; INIT_LIST_HEAD(&async_chunk[i].extents); /* * The locked_page comes all the way from writepage and its * the original page we were actually given. As we spread * this large delalloc region across multiple async_chunk * structs, only the first struct needs a pointer to locked_page * * This way we don't need racey decisions about who is supposed * to unlock it. */ if (locked_page) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to * be accounted against wbc once. Let's do it here * before the paths diverge. wbc accounting is used * only for foreign writeback detection and doesn't * need full accuracy. Just account the whole thing * against the first page. */ wbc_account_cgroup_owner(wbc, locked_page, cur_end - start); async_chunk[i].locked_page = locked_page; locked_page = NULL; } else { async_chunk[i].locked_page = NULL; } if (blkcg_css != blkcg_root_css) { css_get(blkcg_css); async_chunk[i].blkcg_css = blkcg_css; } else { async_chunk[i].blkcg_css = NULL; } btrfs_init_work(&async_chunk[i].work, async_cow_start, async_cow_submit, async_cow_free); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); *nr_written += nr_pages; start = cur_end + 1; } *page_started = 1; return 0; } static noinline int run_delalloc_zoned(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { int ret; ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 0); if (ret) return ret; if (*page_started) return 0; __set_page_dirty_nobuffers(locked_page); account_page_redirty(locked_page); extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); *page_started = 1; return 0; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { int ret; struct btrfs_ordered_sum *sums; LIST_HEAD(list); ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr, bytenr + num_bytes - 1, &list, 0); if (ret == 0 && list_empty(&list)) return 0; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del(&sums->list); kfree(sums); } if (ret < 0) return ret; return 1; } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; /* * If EXTENT_NORESERVE is set it means that when the buffered write was * made we had not enough available data space and therefore we did not * reserve data space for it, since we though we could do NOCOW for the * respective file range (either there is prealloc extent or the inode * has the NOCOW bit set). * * However when we need to fallback to COW mode (because for example the * block group for the corresponding extent was turned to RO mode by a * scrub or relocation) we need to do the following: * * 1) We increment the bytes_may_use counter of the data space info. * If COW succeeds, it allocates a new data extent and after doing * that it decrements the space info's bytes_may_use counter and * increments its bytes_reserved counter by the same amount (we do * this at btrfs_add_reserved_bytes()). So we need to increment the * bytes_may_use counter to compensate (when space is reserved at * buffered write time, the bytes_may_use counter is incremented); * * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so * that if the COW path fails for any reason, it decrements (through * extent_clear_unlock_delalloc()) the bytes_may_use counter of the * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free * space cache inode or an inode of the data relocation tree, we must * also increment bytes_may_use of the data space_info for the same * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; if (is_space_ino || is_reloc_ino) bytes = range_bytes; spin_lock(&sinfo->lock); btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, 0, 0, NULL); } return cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); } /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, int *page_started, unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; u64 cur_offset = start; int ret; bool check_prev = true; const bool freespace_inode = btrfs_is_free_space_inode(inode); u64 ino = btrfs_ino(inode); bool nocow = false; u64 disk_bytenr = 0; const bool force = inode->flags & BTRFS_INODE_NODATACOW; path = btrfs_alloc_path(); if (!path) { extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return -ENOMEM; } while (1) { struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; u64 extent_end; u64 extent_offset; u64 num_bytes = 0; u64 disk_num_bytes; u64 ram_bytes; int extent_type; nocow = false; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; /* * If there is no extent for our range when doing the initial * search, then go back to the previous slot as it will be the * one containing the search offset */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.objectid == ino && found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } check_prev = false; next_slot: /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) { if (cow_start != (u64)-1) cur_offset = cow_start; goto error; } if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; /* * Keep searching until we find an EXTENT_ITEM or there are no * more extents for this inode */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; /* * If the found extent starts after requested offset, then * adjust extent_end to be right before this extent begins */ if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; goto out_check; } /* * Found extent which begins before our range and potentially * intersect it */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); /* * If the extent we got ends before our current offset, * skip to the next extent. */ if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } /* Skip holes */ if (disk_bytenr == 0) goto out_check; /* Skip compressed/encrypted/encoded extents */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out_check; /* * If extent is created before the last volume's snapshot * this implies the extent is shared, hence we can't do * nocow. This is the same check as in * btrfs_cross_ref_exist but without calling * btrfs_search_slot. */ if (!freespace_inode && btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) goto out_check; /* * The following checks can be expensive, as they need to * take other locks and do btree or rbtree searches, so * release the path to avoid blocking other tasks for too * long. */ btrfs_release_path(path); ret = btrfs_cross_ref_exist(root, ino, found_key.offset - extent_offset, disk_bytenr, false); if (ret) { /* * ret could be -EIO if the above fails to read * metadata. */ if (ret < 0) { if (cow_start != (u64)-1) cur_offset = cow_start; goto error; } WARN_ON_ONCE(freespace_inode); goto out_check; } disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* * If there are pending snapshots for this root, we * fall into common COW way */ if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out_check; /* * force cow if csum exists in the range. * this ensure that csum for a given extent are * either valid or do not exist. */ ret = csum_exist_in_range(fs_info, disk_bytenr, num_bytes); if (ret) { /* * ret could be -EIO if the above fails to read * metadata. */ if (ret < 0) { if (cow_start != (u64)-1) cur_offset = cow_start; goto error; } WARN_ON_ONCE(freespace_inode); goto out_check; } /* If the extent's block group is RO, we must COW */ if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) goto out_check; nocow = true; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + ram_bytes; extent_end = ALIGN(extent_end, fs_info->sectorsize); /* Skip extents outside of our requested range */ if (extent_end <= start) { path->slots[0]++; goto next_slot; } } else { /* If this triggers then we have a memory corruption */ BUG(); } out_check: /* * If nocow is false then record the beginning of the range * that needs to be COWed */ if (!nocow) { if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; if (cur_offset > end) break; if (!path->nodes[0]) continue; path->slots[0]++; goto next_slot; } /* * COW range from cow_start to found_key.offset - 1. As the key * will contain the beginning of the first extent that can be * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written); if (ret) goto error; cow_start = (u64)-1; } if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 orig_start = found_key.offset - extent_offset; struct extent_map *em; em = create_io_em(inode, cur_offset, num_bytes, orig_start, disk_bytenr, /* block_start */ num_bytes, /* block_len */ disk_num_bytes, /* orig_block_len */ ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { ret = PTR_ERR(em); goto error; } free_extent_map(em); ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, BTRFS_ORDERED_PREALLOC); if (ret) { btrfs_drop_extent_cache(inode, cur_offset, cur_offset + num_bytes - 1, 0); goto error; } } else { ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, BTRFS_ORDERED_NOCOW); if (ret) goto error; } if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); nocow = false; if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. */ ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; /* * btrfs_reloc_clone_csums() error, now we're OK to call error * handler, as metadata for created ordered extent will only * be freed by btrfs_finish_ordered_io(). */ if (ret) goto error; if (cur_offset > end) break; } btrfs_release_path(path); if (cur_offset <= end && cow_start == (u64)-1) cow_start = cur_offset; if (cow_start != (u64)-1) { cur_offset = end; ret = fallback_to_cow(inode, locked_page, cow_start, end, page_started, nr_written); if (ret) goto error; } error: if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; } static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) return false; return true; } return false; } /* * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { int ret; const bool zoned = btrfs_is_zoned(inode->root->fs_info); if (should_nocow(inode, start, end)) { /* * Normally on a zoned device we're only doing COW writes, but * in case of relocation on a zoned filesystem we have taken * precaution, that we're only writing sequentially. It's safe * to use run_delalloc_nocow() here, like for regular * preallocated inodes. */ ASSERT(!zoned || (zoned && btrfs_is_data_reloc_root(inode->root))); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end, page_started, nr_written); else ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, page_started, nr_written); } ASSERT(ret <= 0); if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; } void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 size; /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; size = orig->end - orig->start + 1; if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; /* * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; num_extents += count_max_extents(fs_info, new_size); if (count_max_extents(fs_info, size) >= num_extents) return; } spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); } /* * Handle merged delayed allocation extents so we can keep track of new extents * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size, old_size; u32 num_extents; /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; if (new->start > other->start) new_size = new->end - other->start + 1; else new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); return; } /* * We have to add up either side to figure out how many extents were * accounted for before we merged into one big extent. If the number of * extents we accounted for is <= the amount we need for the new range * then we can return, otherwise drop. Think of it like this * * [ 4k][MAX_SIZE] * * So we've grown the extent by a MAX_SIZE extent, this would mean we * need 2 outstanding extents, on one side we have 1 and the other side * we have 1 so they are == and we can return. But in this case * * [MAX_SIZE+4k][MAX_SIZE+4k] * * Each range on their own accounts for 2 extents, but merged together * they are only 3 extents worth of accounting, so we need to drop in * this case. */ old_size = other->end - other->start + 1; num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; num_extents += count_max_extents(fs_info, old_size); if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); spin_lock(&root->delalloc_lock); if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->delalloc_inodes); set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); BUG_ON(!list_empty(&root->delalloc_root)); list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } } spin_unlock(&root->delalloc_lock); } void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = root->fs_info; if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); BUG_ON(list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } static void btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { spin_lock(&root->delalloc_lock); __btrfs_del_delalloc_inode(root, inode); spin_unlock(&root->delalloc_lock); } /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); spin_unlock(&BTRFS_I(inode)->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) return; percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; if (*bits & EXTENT_DEFRAG) BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } if (!(state->state & EXTENT_DELALLOC_NEW) && (*bits & EXTENT_DELALLOC_NEW)) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - state->start; spin_unlock(&BTRFS_I(inode)->lock); } } /* * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, struct extent_state *state, unsigned *bits) { struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); } /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we * don't need to call delalloc_release_metadata if there is an * error. */ if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) return; if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; if (do_list && inode->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags)) btrfs_del_delalloc_inode(root, inode); spin_unlock(&inode->lock); } if ((state->state & EXTENT_DELALLOC_NEW) && (*bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; if (*bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } } /* * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit * in a chunk's stripe. This function ensures that bios do not span a * stripe/chunk * * @page - The page we are about to add to the bio * @size - size we want to add to the bio * @bio - bio we want to ensure is smaller than a stripe * @bio_flags - flags of the bio * * return 1 if page cannot be added to the bio * return 0 if page can be added to the bio * return error otherwise */ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; u32 bio_len = bio->bi_iter.bi_size; struct extent_map *em; int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); if (IS_ERR(em)) return PTR_ERR(em); ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); if (ret < 0) goto out; if (geom.len < bio_len + size) ret = 1; out: free_extent_map(em); return ret; } /* * in order to insert checksums into the metadata in large chunks, * we wait until bio submission time. All the pages in the bio are * checksummed and sums are attached onto the ordered extent record. * * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } /* * Split an extent_map at [start, start + len] * * This function is intended to be used only for extract_ordered_extent(). */ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 post) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; struct extent_map *split_pre = NULL; struct extent_map *split_mid = NULL; struct extent_map *split_post = NULL; int ret = 0; unsigned long flags; /* Sanity check */ if (pre == 0 && post == 0) return 0; split_pre = alloc_extent_map(); if (pre) split_mid = alloc_extent_map(); if (post) split_post = alloc_extent_map(); if (!split_pre || (pre && !split_mid) || (post && !split_post)) { ret = -ENOMEM; goto out; } ASSERT(pre + post < len); lock_extent(&inode->io_tree, start, start + len - 1); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (!em) { ret = -EIO; goto out_unlock; } ASSERT(em->len == len); ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); ASSERT(!list_empty(&em->list)); flags = em->flags; clear_bit(EXTENT_FLAG_PINNED, &em->flags); /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; split_pre->len = (pre ? pre : em->len - post); split_pre->orig_start = split_pre->start; split_pre->block_start = em->block_start; split_pre->block_len = split_pre->len; split_pre->orig_block_len = split_pre->block_len; split_pre->ram_bytes = split_pre->len; split_pre->flags = flags; split_pre->compress_type = em->compress_type; split_pre->generation = em->generation; replace_extent_mapping(em_tree, em, split_pre, 1); /* * Now we only have an extent_map at: * [em->start, em->start + pre] if pre != 0 * [em->start, em->start + em->len - post] if pre == 0 */ if (pre) { /* Insert the middle extent_map */ split_mid->start = em->start + pre; split_mid->len = em->len - pre - post; split_mid->orig_start = split_mid->start; split_mid->block_start = em->block_start + pre; split_mid->block_len = split_mid->len; split_mid->orig_block_len = split_mid->block_len; split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->compress_type = em->compress_type; split_mid->generation = em->generation; add_extent_mapping(em_tree, split_mid, 1); } if (post) { split_post->start = em->start + em->len - post; split_post->len = post; split_post->orig_start = split_post->start; split_post->block_start = em->block_start + em->len - post; split_post->block_len = split_post->len; split_post->orig_block_len = split_post->block_len; split_post->ram_bytes = split_post->len; split_post->flags = flags; split_post->compress_type = em->compress_type; split_post->generation = em->generation; add_extent_mapping(em_tree, split_post, 1); } /* Once for us */ free_extent_map(em); /* Once for the tree */ free_extent_map(em); out_unlock: write_unlock(&em_tree->lock); unlock_extent(&inode->io_tree, start, start + len - 1); out: free_extent_map(split_pre); free_extent_map(split_mid); free_extent_map(split_post); return ret; } static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { struct btrfs_ordered_extent *ordered; u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 file_len; u64 len = bio->bi_iter.bi_size; u64 end = start + len; u64 ordered_end; u64 pre, post; int ret = 0; ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; /* No need to split */ if (ordered->disk_num_bytes == len) goto out; /* We cannot split once end_bio'd ordered extent */ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { ret = -EINVAL; goto out; } /* We cannot split a compressed ordered extent */ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { ret = -EINVAL; goto out; } ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; /* bio must be in one ordered extent */ if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { ret = -EINVAL; goto out; } /* Checksum list should be empty */ if (WARN_ON_ONCE(!list_empty(&ordered->list))) { ret = -EINVAL; goto out; } file_len = ordered->num_bytes; pre = start - ordered->disk_bytenr; post = ordered_end - end; ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; ret = split_zoned_em(inode, file_offset, file_len, pre, post); out: btrfs_put_ordered_extent(ordered); return errno_to_blk_status(ret); } /* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. * * Rules about async/sync submit, * a) read: sync submit * * b) write without checksum: sync submit * * c) write with checksum: * c-1) if bio is issued by fsync: sync submit * (sync_writers != 0) * * c-2) if root is reloc root: sync submit * (only in case of buffered IO) * * c-3) otherwise: async submit */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || !fs_info->csum_root; if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { struct page *page = bio_first_bvec_all(bio)->bv_page; loff_t file_offset = page_offset(page); ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); if (ret) goto out; } if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); if (ret) goto out; if (bio_flags & EXTENT_BIO_COMPRESSED) { ret = btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); goto out; } else { /* * Lookup bio sums does extra checks around whether we * need to csum or not, which is why we ignore skip_sum * here. */ ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) goto out; } goto mapit; } else if (async && !skip_sum) { /* csum items have already been cloned */ if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, 0, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); if (ret) goto out; } mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num); out: if (ret) { bio->bi_status = ret; bio_endio(bio); } return ret; } /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ static int add_pending_csums(struct btrfs_trans_handle *trans, struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; } return 0; } static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, const u64 start, const u64 len, struct extent_state **cached_state) { u64 search_start = start; const u64 end = start + len - 1; while (search_start < end) { const u64 search_len = end - search_start + 1; struct extent_map *em; u64 em_len; int ret = 0; em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); if (em->block_start != EXTENT_MAP_HOLE) goto next; em_len = em->len; if (em->start < search_start) em_len -= search_start - em->start; if (em_len > search_len) em_len = search_len; ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, EXTENT_DELALLOC_NEW, 0, NULL, cached_state, GFP_NOFS, NULL); next: search_start = extent_map_end(em); free_extent_map(em); if (ret) return ret; } return 0; } int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); if (start >= i_size_read(&inode->vfs_inode) && !(inode->flags & BTRFS_INODE_PREALLOC)) { /* * There can't be any extents following eof in this case so just * set the delalloc new bit for the range directly. */ extra_bits |= EXTENT_DELALLOC_NEW; } else { int ret; ret = btrfs_find_new_delalloc_bytes(inode, start, end + 1 - start, cached_state); if (ret) return ret; } return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; struct inode *inode; struct btrfs_work work; }; static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page; struct btrfs_inode *inode; u64 page_start; u64 page_end; int ret = 0; bool free_delalloc_space = true; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; inode = BTRFS_I(fixup->inode); page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; /* * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); again: lock_page(page); /* * Before we queued this fixup, we took a reference on the page. * page->mapping may go NULL, but it shouldn't be moved to a different * address space. */ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { /* * Unfortunately this is a little tricky, either * * 1) We got here and our page had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. * 2) Our page was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but * because the page was already dealt with we don't want to * mark the page with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC * when the page was already properly dealt with. */ if (!ret) { btrfs_delalloc_release_extents(inode, PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); } ret = 0; goto out_page; } /* * We can't mess with the page state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) goto out_page; lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); if (ret) goto out_reserved; /* * Everything went as planned, we're now the owner of a dirty page with * delayed allocation bits set and space reserved for our COW * destination. * * The page was dirty when we started, nothing should have cleaned it. */ BUG_ON(!PageDirty(page)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); clear_page_dirty_for_io(page); SetPageError(page); } ClearPageChecked(page); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); /* * As a precaution, do a delayed iput in case it would be the last iput * that could need flushing space. Recursing back to fixup worker would * deadlock. */ btrfs_add_delayed_iput(&inode->vfs_inode); } /* * There are a few paths in the higher layers of the kernel that directly * set the page dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_writepage_fixup *fixup; /* This page has ordered extent covering it already */ if (PageOrdered(page)) return 0; /* * PageChecked is set below when we create a fixup worker for this page, * don't try to create another one if we're already PageChecked() * * The extent_io writepage code will redirty the page if we send back * EAGAIN. */ if (PageChecked(page)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); if (!fixup) return -EAGAIN; /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the page lock, and we can't trust * page->mapping outside of the page lock. */ ihold(inode); SetPageChecked(page); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; fixup->inode = inode; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, const bool update_inode_bytes, u64 qgroup_reserved) { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want * to drop it from the cache until it is completely in the btree. * * So, tell btrfs_drop_extents to leave this extent in the cache. * the caller is expected to unpin it and allow it to be merged * with the others. */ drop_args.path = path; drop_args.start = file_pos; drop_args.end = file_pos + num_bytes; drop_args.replace_extent = true; drop_args.extent_item_size = sizeof(*stack_fi); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) goto out; } leaf = path->nodes[0]; btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); write_extent_buffer(leaf, stack_fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { u64 inline_size = round_down(drop_args.bytes_found, sectorsize); inline_size = drop_args.bytes_found - inline_size; btrfs_update_inode_bytes(inode, sectorsize, inline_size); drop_args.bytes_found -= inline_size; num_bytes -= sectorsize; } if (update_inode_bytes) btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), file_pos, qgroup_reserved, &ins); out: btrfs_free_path(path); return ret; } static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); spin_lock(&cache->lock); cache->delalloc_bytes -= len; spin_unlock(&cache->lock); btrfs_put_block_group(cache); } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; u64 logical_len; bool update_inode_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) logical_len = oe->truncated_len; else logical_len = oe->num_bytes; btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ /* * For delalloc, when completing an ordered extent we update the inode's * bytes when clearing the range in the inode's io tree, so pass false * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } /* * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. */ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 start, end; int compress_type = 0; int ret = 0; u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool clear_reserved_extent = true; unsigned int clear_bits = EXTENT_DEFRAG; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } if (ordered_extent->bdev) btrfs_rewrite_logical_zoned(ordered_extent); btrfs_free_io_failure_record(inode, start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; /* Truncated the entire extent, don't bother adding */ if (!logical_len) goto out; } if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } trans->block_rsv = &inode->block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } clear_bits |= EXTENT_LOCKED; lock_extent_bits(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } trans->block_rsv = &inode->block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + logical_len); btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } } unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } /* * If this is a new delalloc range, clear its new delalloc flag to * update the inode's number of bytes. This needs to be done first * before updating the inode item. */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, 0, 0, &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { u64 unwritten_start = start; /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered * extent, and mark the inode with the error if it wasn't * already set. Any error during writeback would have already * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) mapping_set_error(ordered_extent->inode->i_mapping, -EIO); if (truncated) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* * Drop extent maps for the part of the extent we didn't write. * * We have an exception here for the free_space_inode, this is * because when we do btrfs_get_extent() on the free space inode * we will search the commit root. If this is a new block group * we won't find anything, and we will trip over the assert in * writepage where we do ASSERT(em->block_start != * EXTENT_MAP_HOLE). * * Theoretically we could also skip this for any NOCOW extent as * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ if (!btrfs_is_free_space_inode(inode)) btrfs_drop_extent_cache(inode, unwritten_start, end, 0); /* * If the ordered extent had an IOERR or something else went * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. * * If we made it past insert_reserved_file_extent before we * errored out then we don't need to do this as the accounting * has already been done. */ if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { /* * Discard the range before returning it back to the * free space pool */ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) btrfs_discard_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. */ btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } } /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); return ret; } static void finish_ordered_fn(struct btrfs_work *work) { struct btrfs_ordered_extent *ordered_extent; ordered_extent = container_of(work, struct btrfs_ordered_extent, work); btrfs_finish_ordered_io(ordered_extent); } void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, finish_ordered_fn, uptodate); } /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode * @io_bio: btrfs_io_bio which contains the csum * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page * @start: logical offset in the file * * The length of such check is always one sector size. */ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u32 pgoff, u64 start) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; u32 len = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); offset_sectors = bio_offset >> fs_info->sectorsize_bits; csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; crypto_shash_digest(shash, kaddr + pgoff, len, csum); if (memcmp(csum, csum_expected, csum_size)) goto zeroit; kunmap_atomic(kaddr); return 0; zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, io_bio->mirror_num); if (io_bio->device) btrfs_dev_stat_inc_and_print(io_bio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); return -EIO; } /* * When reads are done, we need to check csums to verify the data is correct. * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. * * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) * * Return a bitmap where bit set means a csum mismatch, and bit not set means * csum match. */ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; if (PageChecked(page)) { ClearPageChecked(page); return 0; } /* * For subpage case, above PageChecked is not safe as it's not subpage * compatible. * But for now only cow fixup and compressed read utilize PageChecked * flag, while in this context we can easily use io_bio->csum to * determine if we really need to do csum verification. * * So for now, just exit if io_bio->csum is NULL, as it means it's * compressed read, and its compressed data csum has already been * verified. */ if (io_bio->csum == NULL) return 0; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; if (!root->fs_info->csum_root) return 0; ASSERT(page_offset(page) <= start && end <= page_offset(page) + PAGE_SIZE - 1); for (pg_off = offset_in_page(start); pg_off < offset_in_page(end); pg_off += sectorsize, bio_offset += sectorsize) { u64 file_offset = pg_off + page_offset(page); int ret; if (btrfs_is_data_reloc_root(root) && test_range_bit(io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM, 1, NULL)) { /* Skip the range without csum for data reloc inode */ clear_extent_bits(io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM); continue; } ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; result |= (1U << nr_bit); } } return result; } /* * btrfs_add_delayed_iput - perform a delayed iput on @inode * * @inode: The inode we want to perform iput on * * This function uses the generic vfs_inode::i_count to track whether we should * just decrement it (in case it's > 1) or if this is the last iput then link * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *binode = BTRFS_I(inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); ASSERT(list_empty(&binode->delayed_iput)); list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); spin_unlock(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); spin_lock(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { spin_lock(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); spin_unlock(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { spin_lock(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); cond_resched_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); } /** * Wait for flushing all delayed iputs * * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. * * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { int ret = wait_event_killable(fs_info->delayed_iputs_wait, atomic_read(&fs_info->nr_delayed_iputs) == 0); if (ret) return -EINTR; return 0; } /* * This creates an orphan entry for the given inode in case something goes wrong * in the middle of an unlink. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); return ret; } return 0; } /* * We have done the delete so we can go ahead and remove the orphan item for * this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); } /* * this cleans up any orphans that may be left on the list from the last use * of this root. */ int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) return 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; /* * if ret == 0 means we found what we were searching for, which * is weird, but possible, so only screw with path if we didn't * find the key and see if we have stuff that matches */ if (ret > 0) { ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; } /* pull out the item */ leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break; /* release the path since we're done with it */ btrfs_release_path(path); /* * this is where we are basically btrfs_lookup, without the * crossing root thing. we store the inode number in the * offset of the orphan item. */ if (found_key.offset == last_objectid) { btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); ret = -EINVAL; goto out; } last_objectid = found_key.offset; found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(fs_info->sb, last_objectid, root); ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ENOENT) goto out; if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; /* * This is an orphan in the tree root. Currently these * could come from 2 sources: * a) a root (snapshot/subvolume) deletion in progress * b) a free space cache inode * We need to distinguish those two, as the orphan item * for a root must not get deleted before the deletion * of the snapshot/subvolume's tree completes. * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking * up the root from that radix tree. */ spin_lock(&fs_info->fs_roots_radix_lock); dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ key.offset = found_key.objectid - 1; continue; } } /* * If we have an inode with links, there are a couple of * possibilities: * * 1. We were halfway through creating fsverity metadata for the * file. In that case, the orphan item represents incomplete * fsverity metadata which must be cleaned up with * btrfs_drop_verity_items and deleting the orphan item. * 2. Old kernels (before v3.12) used to create an * orphan item for truncate indicating that there were possibly * extent items past i_size that needed to be deleted. In v3.12, * truncate was changed to update i_size in sync with the extent * items, but the (useless) orphan item was still created. Since * v4.18, we don't create the orphan item for truncate at all. * * So, this item could mean that we need to do a truncate, but * only if this filesystem was last used on a pre-v3.12 kernel * and was not cleanly unmounted. The odds of that are quite * slim, and it's a pain to do the truncate now, so just delete * the orphan item. * * It's also possible that this orphan item was supposed to be * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ if (ret == -ENOENT || inode->i_nlink) { if (!ret) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); if (ret) goto out; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); if (ret) goto out; continue; } nr_unlink++; /* this will do delete_inode and everything for us */ iput(inode); } /* release the path since we're done with it */ btrfs_release_path(path); root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans); } if (nr_unlink) btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; } /* * very simple check to peek ahead in the leaf looking for xattrs. If we * don't find any xattrs, we know there can't be any acls. * * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int slot, u64 objectid, int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; static u64 xattr_access = 0; static u64 xattr_default = 0; int scanned = 0; if (!xattr_access) { xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, strlen(XATTR_NAME_POSIX_ACL_ACCESS)); xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); /* we found a different objectid, there must not be acls */ if (found_key.objectid != objectid) return 0; /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; } /* * we found a key greater than an xattr key, there can't * be any acls later on */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) return 0; slot++; scanned++; /* * it goes inode, inode backrefs, xattrs, extents, * so if there are a ton of hard links to an inode there can * be a lot of backrefs. Don't waste time searching too hard, * this is just an optimization */ if (scanned >= 8) break; } /* we hit the end of the leaf before we found an xattr or * something larger than an xattr. We have to assume the inode * has acls */ if (*first_xattr_slot == -1) *first_xattr_slot = slot; return 1; } /* * read an inode from the btree into the in-memory inode */ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; unsigned long ptr; int maybe_acls; u32 rdev; int ret; bool filled = false; int first_xattr_slot; ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; if (!path) { path = btrfs_alloc_path(); if (!path) return -ENOMEM; } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { if (path != in_path) btrfs_free_path(path); return ret; } leaf = path->nodes[0]; if (filled) goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, round_up(i_size_read(inode), fs_info->sectorsize)); inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); BTRFS_I(inode)->i_otime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->otime); BTRFS_I(inode)->i_otime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); inode_set_iversion_queried(inode, btrfs_inode_sequence(leaf, inode_item)); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); BTRFS_I(inode)->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); cache_index: /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. * * This is required for both inode re-read from disk and delayed inode * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation * against the inode was last made. So here we assume the inode might * have been evicted, and therefore the exact value of last_unlink_trans * lost, and set it to last_trans to avoid metadata inconsistencies * between the inode and its parent if the inode is fsync'ed and the log * replayed. For example, in the scenario: * * touch mydir/foo * ln mydir/foo mydir/bar * sync * unlink mydir/bar * echo 2 > /proc/sys/vm/drop_caches # evicts inode * xfs_io -c fsync mydir/foo * <power failure> * mount fs, triggers fsync log replay * * We must make sure that when we fsync our inode foo we also log its * parent inode, otherwise after log replay the parent still has the * dentry with the "bar" name but our inode foo has a link count of 1 * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation * of the last transaction where this inode was used for a reflink * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); if (location.objectid != btrfs_ino(BTRFS_I(inode))) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); if (location.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); } if (path != in_path) btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: inode->i_fop = &btrfs_dir_file_operations; inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; break; default: inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); return 0; } /* * given a leaf and an inode, copy the inode fields into the leaf */ static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, struct inode *inode) { struct btrfs_map_token token; u64 flags; btrfs_init_map_token(&token, leaf); btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); btrfs_set_token_inode_mode(&token, item, inode->i_mode); btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, inode->i_atime.tv_sec); btrfs_set_token_timespec_nsec(&token, &item->atime, inode->i_atime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->mtime, inode->i_mtime.tv_sec); btrfs_set_token_timespec_nsec(&token, &item->mtime, inode->i_mtime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->ctime, inode->i_ctime.tv_sec); btrfs_set_token_timespec_nsec(&token, &item->ctime, inode->i_ctime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime.tv_sec); btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime.tv_nsec); btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); btrfs_set_token_inode_generation(&token, item, BTRFS_I(inode)->generation); btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } /* * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; struct extent_buffer *leaf; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); if (ret) { if (ret > 0) ret = -ENOENT; goto failed; } leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_mark_buffer_dirty(leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: btrfs_free_path(path); return ret; } /* * copy everything in the in-memory inode into the btree. */ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; /* * If the inode is a free space inode, we can deadlock during commit * if we put it into the delayed code. * * The data relocation inode should also be directly updated * without delay */ if (!btrfs_is_free_space_inode(inode) && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); return ret; } return btrfs_update_inode_item(trans, root, inode); } int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode) { int ret; ret = btrfs_update_inode(trans, root, inode); if (ret == -ENOSPC) return btrfs_update_inode_item(trans, root, inode); return ret; } /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; struct btrfs_dir_item *di; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; } ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; btrfs_release_path(path); /* * If we don't have dir index, we have to get it by looking up * the inode ref, since we get the inode ref, remove it directly, * it is unnecessary to do delayed deletion. * * But if we have dir index, needn't search inode ref to get it. * Since the inode ref is close to the inode item, it is better * that we delay to delete it, and just do this deletion when * we update the inode item. */ if (inode->dir_index) { ret = btrfs_delayed_delete_inode_ref(inode); if (!ret) { index = inode->dir_index; goto skip_backref; } } ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name_len, name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } skip_backref: ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); if (ret != 0 && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto err; } ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); if (ret == -ENOENT) ret = 0; else if (ret) btrfs_abort_transaction(trans, ret); /* * If we have a pending delayed iput we could end up with the final iput * being run in btrfs-cleaner context. If we have enough of these built * up we can end up burning a lot of time in btrfs-cleaner without any * way to throttle the unlinks. Since we're currently holding a ref on * the inode we can run the delayed iput here without any issues as the * final iput won't be done until after we drop the ref we're currently * holding. */ btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) goto out; btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); ret = btrfs_update_inode(trans, root, dir); out: return ret; } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { int ret; ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); } return ret; } /* * helper to start transaction for unlink and rmdir. * * unlink and rmdir are special in btrfs, they do not always free space, so * if we cannot make our reservations the normal way try and see if there is * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. */ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) { struct btrfs_root *root = BTRFS_I(dir)->root; /* * 1 for the possible orphan item * 1 for the dir item * 1 for the dir index * 1 for the inode ref * 1 for the inode */ return btrfs_start_transaction_fallback_global_rsv(root, 5); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (ret) goto out; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto out; } out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return ret; } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; const char *name = dentry->d_name.name; int name_len = dentry->d_name.len; u64 index; int ret; u64 objectid; u64 dir_ino = btrfs_ino(BTRFS_I(dir)); if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = inode->root->root_key.objectid; } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { objectid = inode->location.objectid; } else { WARN_ON(1); return -EINVAL; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, name_len, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; } leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); /* * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, name, name_len); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; else ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); index = key.offset; btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, &index, name, name_len); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); dir->i_mtime = dir->i_ctime = current_time(dir); ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; } /* * Helper to check if the subvolume references other subvolumes or if it's * default. */ static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; u64 dir_id; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, dir_id, "default", 7, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); goto out; } btrfs_release_path(path); } key.objectid = root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ ret = -EUCLEAN; goto out; } ret = 0; if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == root->root_key.objectid && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } out: btrfs_free_path(path); return ret; } /* Delete all dentries for inodes belonging to the root */ static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; struct rb_node *prev; struct btrfs_inode *entry; struct inode *inode; u64 objectid = 0; if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); again: node = root->inode_tree.rb_node; prev = NULL; while (node) { prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); if (objectid < btrfs_ino(entry)) node = node->rb_left; else if (objectid > btrfs_ino(entry)) node = node->rb_right; else break; } if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); if (objectid <= btrfs_ino(entry)) { node = prev; break; } prev = rb_next(prev); } } while (node) { entry = rb_entry(node, struct btrfs_inode, rb_node); objectid = btrfs_ino(entry) + 1; inode = igrab(&entry->vfs_inode); if (inode) { spin_unlock(&root->inode_lock); if (atomic_read(&inode->i_count) > 1) d_prune_aliases(inode); /* * btrfs_drop_inode will have it removed from the inode * cache when its usage count hits zero. */ iput(inode); cond_resched(); spin_lock(&root->inode_lock); goto again; } if (cond_resched_lock(&root->inode_lock)) goto again; node = rb_next(node); } spin_unlock(&root->inode_lock); } int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; int ret; down_write(&fs_info->subvol_sem); /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit * again is not run concurrently. */ spin_lock(&dest->root_item_lock); if (dest->send_in_progress) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); ret = -EPERM; goto out_up_write; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); ret = -EPERM; goto out_up_write; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); ret = may_destroy_subvol(dest); if (ret) goto out_undead; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * One for dir inode, * two for dir entries, * two for root ref/backref. */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_undead; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } ret = btrfs_record_root_in_trans(trans, dest); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); btrfs_set_root_drop_level(&dest->root_item, 0); btrfs_set_root_refs(&dest->root_item, 0); if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, dest->root_key.objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } free_anon_bdev(dest->anon_dev); dest->anon_dev = 0; out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); out_undead: if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); } out_up_write: up_write(&fs_info->subvol_sem); if (!ret) { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); } return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int err = 0; struct btrfs_trans_handle *trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) return btrfs_delete_subvolume(dir, dentry); trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); /* * Propagate the last_unlink_trans value of the deleted dir to its * parent directory. This is to prevent an unrecoverable log tree in the * case we do something like this: * 1) create dir foo * 2) create snapshot under dir foo * 3) delete the snapshot * 4) rmdir foo * 5) mkdir foo * 6) fsync foo or some file inside foo * * This is because we can't unlink other roots when replaying the dir * deletes for directory foo. */ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid) btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, dir, dentry); goto out; } err = btrfs_orphan_add(trans, BTRFS_I(inode)); if (err) goto out; /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (!err) btrfs_i_size_write(BTRFS_I(inode), 0); out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return err; } /* * Return this if we need to call truncate_block for the last bit of the * truncate. */ #define NEED_TRUNCATE_BLOCK 1 /* * Remove inode items from a given root. * * @trans: A transaction handle. * @root: The root from which to remove items. * @inode: The inode whose items we want to remove. * @new_size: The new i_size for the inode. This is only applicable when * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. * @min_type: The minimum key type to remove. All keys with a type * greater than this value are removed and all keys with * this type are removed only if their offset is >= @new_size. * @extents_found: Output parameter that will contain the number of file * extent items that were removed or adjusted to the new * inode i_size. The caller is responsible for initializing * the counter. Also, it can be NULL if the caller does not * need this counter. * * Remove all keys associated with the inode from the given root that have a key * with a type greater than or equals to @min_type. When @min_type has a value of * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value * greater than or equals to @new_size. If a file extent item that starts before * @new_size and ends after it is found, its length is adjusted. * * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, u64 new_size, u32 min_type, u64 *extents_found) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; struct btrfs_key found_key; u64 extent_start = 0; u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; int ret; u64 ino = btrfs_ino(inode); u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); struct extent_state *cached_state = NULL; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); /* * For non-free space inodes and non-shareable roots, we want to back * off from time to time. This means all inodes in subvolume roots, * reloc roots, and data reloc roots. */ if (!btrfs_is_free_space_inode(inode) && test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) be_nice = true; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_BACK; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { lock_extent_bits(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this * new size is not block aligned since we will be keeping the * last block of the extent just the way it is. */ btrfs_drop_extent_cache(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); } /* * This function is also used to drop the items in the log tree before * we relog the inode, so if root != BTRFS_I(inode)->root, it means * it is used to drop the logged items. So we shouldn't kill the delayed * items. */ if (min_type == 0 && root == inode->root) btrfs_kill_delayed_inode_items(inode); key.objectid = ino; key.offset = (u64)-1; key.type = (u8)-1; search_again: /* * with a 16K leaf size and 128MB extents, you can actually queue * up a huge file in a single leaf. Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ if (be_nice && bytes_deleted > SZ_32M && btrfs_should_end_transaction(trans)) { ret = -EAGAIN; goto out; } ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0) { ret = 0; /* there are no items in the tree for us to truncate, we're * done */ if (path->slots[0] == 0) goto out; path->slots[0]--; } while (1) { u64 clear_start = 0, clear_len = 0; fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); found_type = found_key.type; if (found_key.objectid != ino) break; if (found_type < min_type) break; item_end = found_key.offset; if (found_type == BTRFS_EXTENT_DATA_KEY) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); trace_btrfs_truncate_show_fi_regular( inode, leaf, fi, found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_ram_bytes(leaf, fi); trace_btrfs_truncate_show_fi_inline( inode, leaf, fi, path->slots[0], found_key.offset); } item_end--; } if (found_type > min_type) { del_item = 1; } else { if (item_end < new_size) break; if (found_key.offset >= new_size) del_item = 1; else del_item = 0; } found_extent = 0; /* FIXME, shrink the extent if the ref count is only 1 */ if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; if (extents_found != NULL) (*extents_found)++; if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; clear_start = found_key.offset; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); if (!del_item) { u64 orig_num_bytes = btrfs_file_extent_num_bytes(leaf, fi); extent_num_bytes = ALIGN(new_size - found_key.offset, fs_info->sectorsize); clear_start = ALIGN(new_size, fs_info->sectorsize); btrfs_set_file_extent_num_bytes(leaf, fi, extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && extent_start != 0) inode_sub_bytes(&inode->vfs_inode, num_dec); btrfs_mark_buffer_dirty(leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); extent_offset = found_key.offset - btrfs_file_extent_offset(leaf, fi); /* FIXME blocksize != 4096 */ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) inode_sub_bytes(&inode->vfs_inode, num_dec); } } clear_len = num_dec; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * we can't truncate inline items that have had * special encodings */ if (!del_item && btrfs_file_extent_encryption(leaf, fi) == 0 && btrfs_file_extent_other_encoding(leaf, fi) == 0 && btrfs_file_extent_compression(leaf, fi) == 0) { u32 size = (u32)(new_size - found_key.offset); btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); btrfs_truncate_item(path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to * just before this extent. */ ret = NEED_TRUNCATE_BLOCK; break; } else { /* * Inline extents are special, we just treat * them as a full sector worth in the file * extent tree just for simplicity sake. */ clear_len = fs_info->sectorsize; } if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) inode_sub_bytes(&inode->vfs_inode, item_end + 1 - new_size); } delete: /* * We use btrfs_truncate_inode_items() to clean up log trees for * multiple fsyncs, and in this case we don't want to clear the * file extent range because it's just the log. */ if (root == inode->root) { ret = btrfs_inode_clear_file_extent_range(inode, clear_start, clear_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } } if (del_item) last_size = found_key.offset; else last_size = new_size; if (del_item) { if (!pending_del_nr) { /* no pending yet, add ourselves */ pending_del_slot = path->slots[0]; pending_del_nr = 1; } else if (pending_del_nr && path->slots[0] + 1 == pending_del_slot) { /* hop on the pending chunk */ pending_del_nr++; pending_del_slot = path->slots[0]; } else { BUG(); } } else { break; } should_throttle = false; if (found_extent && root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref ref = { 0 }; bytes_deleted += extent_num_bytes; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, extent_start, extent_num_bytes, 0); ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), ino, extent_offset, root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } if (be_nice) { if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } } if (found_type == BTRFS_INODE_ITEM_KEY) break; if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || should_throttle) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (ret) { btrfs_abort_transaction(trans, ret); break; } pending_del_nr = 0; } btrfs_release_path(path); /* * We can generate a lot of delayed refs, so we need to * throttle every once and a while and make sure we're * adding enough space to keep up with the work we are * generating. Since we hold a transaction here we * can't flush, and we don't want to FLUSH_LIMIT because * we could have generated too many delayed refs to * actually allocate, so just bail if we're short and * let the normal reservation dance happen higher up. */ if (should_throttle) { ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_NO_FLUSH); if (ret) { ret = -EAGAIN; break; } } goto search_again; } else { path->slots[0]--; } } out: if (ret >= 0 && pending_del_nr) { int err; err = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (err) { btrfs_abort_transaction(trans, err); ret = err; } } if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ASSERT(last_size >= new_size); if (!ret && last_size > new_size) last_size = new_size; btrfs_inode_safe_disk_i_size_write(inode, last_size); unlock_extent_cached(&inode->io_tree, lock_start, (u64)-1, &cached_state); } btrfs_free_path(path); return ret; } /* * btrfs_truncate_block - read, zero a chunk and write a block * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the * offset * @front - zero up to the offset instead of from the offset on * * This will find the block for the "from" offset and cow the block and zero the * part we want to zero. This is used with truncate and hole punching. */ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; if (IS_ALIGNED(offset, blocksize) && (!len || IS_ALIGNED(len, blocksize))) goto out; block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize); if (ret < 0) { if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { goto out; } } ret = btrfs_delalloc_reserve_metadata(inode, blocksize); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, block_start, blocksize); goto out; } again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); ret = -ENOMEM; goto out; } ret = set_page_extent_mapped(page); if (ret < 0) goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); lock_page(page); if (page->mapping != mapping) { unlock_page(page); put_page(page); goto again; } if (!PageUptodate(page)) { ret = -EIO; goto out_unlock; } } wait_on_page_writeback(page); lock_extent_bits(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); goto out_unlock; } if (offset != blocksize) { if (!len) len = blocksize - offset; if (front) memzero_page(page, (block_start - page_offset(page)), offset); else memzero_page(page, (block_start - page_offset(page)) + offset, len); flush_dcache_page(page); } ClearPageChecked(page); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL); out_unlock: if (ret) { if (only_release_metadata) btrfs_delalloc_release_metadata(inode, blocksize, true); else btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); unlock_page(page); put_page(page); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); extent_changeset_free(data_reserved); return ret; } static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, u64 offset, u64 len) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; int ret; /* * If NO_HOLES is enabled, we don't need to do anything. * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() * or btrfs_update_inode() will be called, which guarantee that the next * fsync will know this inode was changed and needs to be logged. */ if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; /* * 1 - for the one we're dropping * 1 - for the one we're adding * 1 - for updating the inode. */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) return PTR_ERR(trans); drop_args.start = offset; drop_args.end = offset + len; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) { btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans); return ret; } /* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for * the range between oldsize and size */ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; int err = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ err = btrfs_truncate_block(inode, oldsize, 0, 0); if (err) return err; if (size <= hole_start) return 0; btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; break; } last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; err = maybe_insert_hole(root, inode, cur_offset, hole_size); if (err) break; err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); hole_em = alloc_extent_map(); if (!hole_em) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); goto next; } hole_em->start = cur_offset; hole_em->len = hole_size; hole_em->orig_start = cur_offset; hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = fs_info->generation; while (1) { write_lock(&em_tree->lock); err = add_extent_mapping(em_tree, hole_em, 1); write_unlock(&em_tree->lock); if (err != -EEXIST) break; btrfs_drop_extent_cache(inode, cur_offset, cur_offset + hole_size - 1, 0); } free_extent_map(hole_em); } else { err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; } next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state); return err; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); loff_t newsize = attr->ia_size; int mask = attr->ia_valid; int ret; /* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) inode->i_ctime = inode->i_mtime = current_time(inode); } if (newsize > oldsize) { /* * Don't do an expanding truncate while snapshotting is ongoing. * This is to ensure the snapshot captures a fully consistent * state of this file - if the snapshot captures this expanding * truncation, it must capture all writes that happened before * this truncation. */ btrfs_drew_write_lock(&root->snapshot_lock); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); if (ret) { btrfs_drew_write_unlock(&root->snapshot_lock); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans); } i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if (btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) return ret; } /* * We're truncating a file that used to have good data down to * zero. Make sure any new writes to the file get on disk * on close. */ if (newsize == 0) set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); inode_dio_wait(inode); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { int err; /* * Truncate failed, so fix up the in-memory size. We * adjusted disk_i_size down as we removed extents, so * wait for disk_i_size to be stable and then update the * in-memory size to match. */ err = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } return ret; } static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; int err; if (btrfs_root_readonly(root)) return -EROFS; err = setattr_prepare(mnt_userns, dentry, attr); if (err) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { err = btrfs_setsize(inode, attr); if (err) return err; } if (attr->ia_valid) { setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); } return err; } /* * While truncating the inode pages during eviction, we get the VFS calling * btrfs_invalidatepage() against each page of the inode. This is slow because * the calls to btrfs_invalidatepage() result in a huge amount of calls to * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting * extent_state structures over and over, wasting lots of time. * * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all * those expensive operations on a per page basis and do only the ordered io * finishing, while we release here the extent_map and extent_state structures, * without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree; struct rb_node *node; ASSERT(inode->i_state & I_FREEING); truncate_inode_pages_final(&inode->i_data); write_lock(&map_tree->lock); while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) { struct extent_map *em; node = rb_first_cached(&map_tree->map); em = rb_entry(node, struct extent_map, rb_node); clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &em->flags); remove_extent_mapping(map_tree, em); free_extent_map(em); if (need_resched()) { write_unlock(&map_tree->lock); cond_resched(); write_lock(&map_tree->lock); } } write_unlock(&map_tree->lock); /* * Keep looping until we have no more ranges in the io tree. * We can have ongoing bios started by readahead that have * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before * submitting those bios, which are executed by a separate task (work * queue kthread), inode references (inode->i_count) were not taken * (which would be dropped in the end io callback of each bio). * Therefore here we effectively end up waiting for those bios and * anyone else holding locked ranges without having bumped the inode's * reference count - if we don't do it, when they access the inode's * io_tree to unlock a range it may be too late, leading to an * use-after-free issue. */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; u64 start; u64 end; unsigned state_flags; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); start = state->start; end = state->end; state_flags = state->state; spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. * (Refer to comment in btrfs_invalidatepage, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state); cond_resched(); spin_lock(&io_tree->lock); } spin_unlock(&io_tree->lock); } static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_trans_handle *trans; u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); int ret; /* * Eviction should be taking place at some place safe because of our * delayed iputs. However the normal flushing code will run delayed * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. * * We reserve the delayed_refs_extra here again because we can't use * btrfs_start_transaction(root, 0) for the same deadlocky reason as * above. We reserve our extra bit here because we generate a ton of * delayed refs activity by truncating. * * If we cannot make our reservation we'll attempt to steal from the * global reserve, because we really want to be able to free up space. */ ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { /* * Try to steal from the global reserve if there is space for * it. */ if (btrfs_check_space_for_delayed_refs(fs_info) || btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) { btrfs_warn(fs_info, "could not allocate space for delete; will truncate on mount"); return ERR_PTR(-ENOSPC); } delayed_refs_extra = 0; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return trans; if (delayed_refs_extra) { trans->block_rsv = &fs_info->trans_block_rsv; trans->bytes_reserved = delayed_refs_extra; btrfs_block_rsv_migrate(rsv, trans->block_rsv, delayed_refs_extra, 1); } return trans; } void btrfs_evict_inode(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; int ret; trace_btrfs_inode_evict(inode); if (!root) { fsverity_cleanup_inode(inode); clear_inode(inode); return; } evict_inode_truncate_pages(inode); if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto no_delete; if (is_bad_inode(inode)) goto no_delete; btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1); if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); goto no_delete; } ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) goto no_delete; rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto no_delete; rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = 1; btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto free_rsv; trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 0, 0, NULL); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) goto free_rsv; else if (!ret) break; } /* * Errors here aren't a big deal, it just means we leave orphan items in * the tree. They will be cleaned up on the next mount. If the inode * number gets reused, cleanup deletes the orphan item without doing * anything, and unlink reuses the existing orphan item. * * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ trans = evict_refill_and_join(root, rsv); if (!IS_ERR(trans)) { trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); } free_rsv: btrfs_free_block_rsv(fs_info, rsv); no_delete: /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want * to retry these periodically in the future. */ btrfs_remove_delayed_node(BTRFS_I(inode)); fsverity_cleanup_inode(inode); clear_inode(inode); } /* * Return the key found in the dir entry in the location pointer, fill @type * with BTRFS_FT_*, and return 0. * * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), name, namelen, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); if (location->type != BTRFS_INODE_ITEM_KEY && location->type != BTRFS_ROOT_ITEM_KEY) { ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", __func__, name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) *type = btrfs_dir_type(path->nodes[0], di); out: btrfs_free_path(path); return ret; } /* * when we hit a tree root in a directory, the btrfs part of the inode * needs to be changed to reflect the root directory of the tree root. This * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) { struct btrfs_path *path; struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; int ret; int err = 0; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out; } err = -ENOENT; key.objectid = BTRFS_I(dir)->root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret) { if (ret < 0) err = ret; goto out; } leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) goto out; ret = memcmp_extent_buffer(leaf, dentry->d_name.name, (unsigned long)(ref + 1), dentry->d_name.len); if (ret) goto out; btrfs_release_path(path); new_root = btrfs_get_fs_root(fs_info, location->objectid, true); if (IS_ERR(new_root)) { err = PTR_ERR(new_root); goto out; } *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; location->offset = 0; err = 0; out: btrfs_free_path(path); return err; } static void inode_tree_add(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; struct rb_node *new = &BTRFS_I(inode)->rb_node; u64 ino = btrfs_ino(BTRFS_I(inode)); if (inode_unhashed(inode)) return; parent = NULL; spin_lock(&root->inode_lock); p = &root->inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); if (ino < btrfs_ino(entry)) p = &parent->rb_left; else if (ino > btrfs_ino(entry)) p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); rb_replace_node(parent, new, &root->inode_tree); RB_CLEAR_NODE(parent); spin_unlock(&root->inode_lock); return; } } rb_link_node(new, parent, p); rb_insert_color(new, &root->inode_tree); spin_unlock(&root->inode_lock); } static void inode_tree_del(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&inode->rb_node)) { rb_erase(&inode->rb_node, &root->inode_tree); RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); if (empty) btrfs_add_dead_root(root); } } static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; inode->i_ino = args->ino; BTRFS_I(inode)->location.objectid = args->ino; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); BUG_ON(args->root && !BTRFS_I(inode)->root); return 0; } static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; return args->ino == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; unsigned long hashval = btrfs_inode_hash(ino, root); args.ino = ino; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; } /* * Get an inode object given its inode number and corresponding root. * Path can be preallocated to prevent recursing back to iget through * allocator. NULL is also valid but may require an additional allocation * later. */ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path) { struct inode *inode; inode = btrfs_iget_locked(s, ino, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { int ret; ret = btrfs_read_locked_inode(inode, path); if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); } else { iget_failed(inode); /* * ret > 0 can come from btrfs_search_slot called by * btrfs_read_locked_inode, this means the inode item * was not found. */ if (ret > 0) ret = -ENOENT; inode = ERR_PTR(ret); } } return inode; } struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) { return btrfs_iget_path(s, ino, root, NULL); } static struct inode *new_simple_dir(struct super_block *s, struct btrfs_key *key, struct btrfs_root *root) { struct inode *inode = new_inode(s); if (!inode) return ERR_PTR(-ENOMEM); BTRFS_I(inode)->root = btrfs_grab_root(root); memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ inode->i_op = &simple_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; return inode; } static inline u8 btrfs_inode_type(struct inode *inode) { /* * Compile-time asserts that generic FT_* types still match * BTRFS_FT_* types */ BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN); BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE); BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR); BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV); BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV); BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO); BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK); BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK); return fs_umode_to_ftype(inode->i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; u8 di_type = 0; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, location.objectid, root); if (IS_ERR(inode)) return inode; /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", inode->i_mode, btrfs_inode_type(inode), di_type); iput(inode); return ERR_PTR(-EUCLEAN); } return inode; } ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) inode = ERR_PTR(ret); else inode = new_simple_dir(dir->i_sb, &location, sub_root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); } if (root != sub_root) btrfs_put_root(sub_root); if (!IS_ERR(inode) && root != sub_root) { down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { iput(inode); inode = ERR_PTR(ret); } } return inode; } static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0) return 1; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return 1; } return 0; } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = btrfs_lookup_dentry(dir, dentry); if (inode == ERR_PTR(-ENOENT)) inode = NULL; return d_splice_alias(inode, dentry); } /* * Find the highest existing sequence number in a directory and then set the * in-memory index_cnt variable to the first free sequence number. */ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; struct btrfs_path *path; struct extent_buffer *leaf; int ret; key.objectid = btrfs_ino(inode); key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; /* FIXME: we should be able to handle this */ if (ret == 0) goto out; ret = 0; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } inode->index_cnt = found_key.offset + 1; out: btrfs_free_path(path); return ret; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; btrfs_inode_lock(&dir->vfs_inode, 0); if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) goto out; } } /* index_cnt is the index number of next new entry, so decrement it. */ *index = dir->index_cnt - 1; out: btrfs_inode_unlock(&dir->vfs_inode, 0); return ret; } /* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy * our information into that, and then dir_emit from the buffer. This is * similar to what NFS does, only we don't keep the buffer around in pagecache * because I'm afraid I'll mess that up. Long term we need to make filldir do * copy_to_user_inatomic so we don't have to worry about page faulting under the * tree lock. */ static int btrfs_opendir(struct inode *inode, struct file *file) { struct btrfs_file_private *private; u64 last_index; int ret; ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); if (ret) return ret; private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); if (!private) return -ENOMEM; private->last_index = last_index; private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!private->filldir_buf) { kfree(private); return -ENOMEM; } file->private_data = private; return 0; } static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) { struct btrfs_file_private *private = file->private_data; int ret; ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), &private->last_index); if (ret) return ret; return generic_file_llseek(file, offset, whence); } struct dir_entry { u64 ino; u64 offset; unsigned type; int name_len; }; static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) { while (entries--) { struct dir_entry *entry = addr; char *name = (char *)(entry + 1); ctx->pos = get_unaligned(&entry->offset); if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), get_unaligned(&entry->ino), get_unaligned(&entry->type))) return 1; addr += sizeof(struct dir_entry) + get_unaligned(&entry->name_len); ctx->pos++; } return 0; } static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_private *private = file->private_data; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; void *addr; struct list_head ins_list; struct list_head del_list; int ret; struct extent_buffer *leaf; int slot; char *name_ptr; int name_len; int entries = 0; int total_len = 0; bool put = false; struct btrfs_key location; if (!dir_emit_dots(file, ctx)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; addr = private->filldir_buf; path->reada = READA_FORWARD; INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); put = btrfs_readdir_get_delayed_items(inode, private->last_index, &ins_list, &del_list); again: key.type = BTRFS_DIR_INDEX_KEY; key.offset = ctx->pos; key.objectid = btrfs_ino(BTRFS_I(inode)); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; while (1) { struct dir_entry *entry; leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto err; else if (ret > 0) break; continue; } btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) break; if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) goto next; if (found_key.offset > private->last_index) break; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); if (ret) goto nopos; addr = private->filldir_buf; entries = 0; total_len = 0; goto again; } entry = addr; put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; next: path->slots[0]++; } btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); if (ret) goto nopos; ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) goto nopos; /* * Stop new entries from being returned after we return the last * entry. * * New directory entries are assigned a strictly increasing * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as * they're returned by readdir. Until we re-use freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. * * This is being careful not to overflow 32bit loff_t unless the * last entry requires it because doing so has broken 32bit apps * in the past. */ if (ctx->pos >= INT_MAX) ctx->pos = LLONG_MAX; else ctx->pos = INT_MAX; nopos: ret = 0; err: if (put) btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); btrfs_free_path(path); return ret; } /* * This is somewhat expensive, updating the tree every time the * inode changes. But, it is most likely to find the inode in cache. * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ static int btrfs_dirty_inode(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret; if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } btrfs_end_transaction(trans); if (BTRFS_I(inode)->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; } /* * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; bool dirty = flags & ~S_VERSION; if (btrfs_root_readonly(root)) return -EROFS; if (flags & S_VERSION) dirty |= inode_maybe_inc_iversion(inode, dirty); if (flags & S_CTIME) inode->i_ctime = *now; if (flags & S_MTIME) inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; return dirty ? btrfs_dirty_inode(inode) : 0; } /* * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree */ int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) return ret; } } *index = dir->index_cnt; dir->index_cnt++; return ret; } static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; args.ino = BTRFS_I(inode)->location.objectid; args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), btrfs_find_actor, &args); } /* * Inherit flags from the parent inode. * * Currently only the compression flags and the cow flags are inherited. */ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { unsigned int flags; if (!dir) return; flags = BTRFS_I(dir)->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; if (S_ISREG(inode->i_mode)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; } btrfs_sync_inode_flags_to_i_flags(inode); } static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct user_namespace *mnt_userns, struct inode *dir, const char *name, int name_len, u64 ref_objectid, u64 objectid, umode_t mode, u64 *index) { struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode; struct btrfs_inode_item *inode_item; struct btrfs_key *location; struct btrfs_path *path; struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; int nitems = name ? 2 : 1; unsigned long ptr; unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); nofs_flag = memalloc_nofs_save(); inode = new_inode(fs_info->sb); memalloc_nofs_restore(nofs_flag); if (!inode) { btrfs_free_path(path); return ERR_PTR(-ENOMEM); } /* * O_TMPFILE, set link count to 0, so that after this point, * we fill in an inode item with the correct link count. */ if (!name) set_nlink(inode, 0); /* * we have to initialize this early, so we can reclaim the inode * number if we fail afterwards in this function. */ inode->i_ino = objectid; if (dir && name) { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(BTRFS_I(dir), index); if (ret) { btrfs_free_path(path); iput(inode); return ERR_PTR(ret); } } else if (dir) { *index = 0; } /* * index_cnt is ignored for everything but a dir, * btrfs_set_inode_index_count has an explanation for the magic * number */ BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->dir_index = *index; BTRFS_I(inode)->root = btrfs_grab_root(root); BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the * old info in the log. */ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); key[0].objectid = objectid; key[0].type = BTRFS_INODE_ITEM_KEY; key[0].offset = 0; sizes[0] = sizeof(struct btrfs_inode_item); if (name) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will * be packed into one item. Extended refs will kick in if we * add more hard links than can fit in the ref item. */ key[1].objectid = objectid; key[1].type = BTRFS_INODE_REF_KEY; key[1].offset = ref_objectid; sizes[1] = name_len + sizeof(*ref); } location = &BTRFS_I(inode)->location; location->objectid = objectid; location->offset = 0; location->type = BTRFS_INODE_ITEM_KEY; ret = btrfs_insert_inode_locked(inode); if (ret < 0) { iput(inode); goto fail; } ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) goto fail_unlock; inode_init_owner(mnt_userns, inode, dir, mode); inode_set_bytes(inode, 0); inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); if (name) { ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); btrfs_set_inode_ref_index(path->nodes[0], ref, *index); ptr = (unsigned long)(ref + 1); write_extent_buffer(path->nodes[0], name, ptr, name_len); } btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } inode_tree_add(inode); trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); btrfs_update_root_times(trans, root); ret = btrfs_inode_inherit_props(trans, inode, dir); if (ret) btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); return inode; fail_unlock: discard_new_inode(inode); fail: if (dir && name) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); return ERR_PTR(ret); } /* * utility function to add 'inode' into 'parent_inode' with * a give name and a given sequence number. * if 'add_backref' is true, also insert a backref from the * inode to the parent directory. */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; struct btrfs_root *root = parent_inode->root; u64 ino = btrfs_ino(inode); u64 parent_ino = btrfs_ino(parent_inode); if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { memcpy(&key, &inode->root->root_key, sizeof(key)); } else { key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; } if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, index, name, name_len); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { btrfs_abort_transaction(trans, ret); return ret; } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name_len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime * and ctime of the parent directory with the current time, since the * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { struct timespec64 now = current_time(&parent_inode->vfs_inode); parent_inode->vfs_inode.i_mtime = now; parent_inode->vfs_inode.i_ctime = now; } ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); if (err) btrfs_abort_transaction(trans, err); } /* Return the original error code */ return ret; } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_inode *inode, int backref, u64 index) { int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, backref, index); if (err > 0) err = -EEXIST; return err; } static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; int err; u64 objectid; u64 index = 0; /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto out_unlock; } /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_unlock; err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 0, index); if (err) goto out_unlock; btrfs_update_inode(trans, root, BTRFS_I(inode)); d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err && inode) { inode_dec_link_count(inode); discard_new_inode(inode); } return err; } static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; int err; u64 objectid; u64 index = 0; /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto out_unlock; } /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_unlock; err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_unlock; err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 0, index); if (err) goto out_unlock; d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); if (err && inode) { inode_dec_link_count(inode); discard_new_inode(inode); } btrfs_btree_balance_dirty(fs_info); return err; } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 index; int err; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; /* * 2 items for inode and inode ref * 2 items for dir items * 1 item for parent inode * 1 item for orphan item deletion if O_TMPFILE */ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { err = PTR_ERR(trans); trans = NULL; goto fail; } /* There are several dir indexes for this inode, clear the cache. */ BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); inode->i_ctime = current_time(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 1, index); if (err) { drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { /* * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ err = btrfs_orphan_del(trans, BTRFS_I(inode)); if (err) goto fail; } d_instantiate(dentry, inode); btrfs_log_new_name(trans, old_dentry, NULL, parent); } fail: if (trans) btrfs_end_transaction(trans); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } btrfs_btree_balance_dirty(fs_info); return err; } static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; int err = 0; u64 objectid = 0; u64 index = 0; /* * 2 items for inode and ref * 2 items for dir items * 1 for xattr if selinux is on */ trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_fail; inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto out_fail; } /* these must be set before we unlock the inode */ inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_fail; btrfs_i_size_write(BTRFS_I(inode), 0); err = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (err) goto out_fail; err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), dentry->d_name.name, dentry->d_name.len, 0, index); if (err) goto out_fail; d_instantiate_new(dentry, inode); out_fail: btrfs_end_transaction(trans); if (err && inode) { inode_dec_link_count(inode); discard_new_inode(inode); } btrfs_btree_balance_dirty(fs_info); return err; } static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { int ret; struct extent_buffer *leaf = path->nodes[0]; char *tmp; size_t max_size; unsigned long inline_size; unsigned long ptr; int compress_type; WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(path->slots[0])); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; ptr = btrfs_file_extent_inline_start(item); read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, extent_offset, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end * of the uncompressed data and the end of max_size in case the decompressed * data ends up shorter than ram_bytes. That doesn't cover the hole between * the end of an inline extent and the beginning of the next block, so we * cover that region here. */ if (max_size + pg_offset < PAGE_SIZE) memzero_page(page, pg_offset + max_size, PAGE_SIZE - max_size - pg_offset); kfree(tmp); return ret; } /** * btrfs_get_extent - Lookup the first extent overlapping a range in a file. * @inode: file to search in * @page: page to read extent data into if the extent is inline * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * * This returns the first &struct extent_map which overlaps with the given * range, reading it from the B-tree and caching it if necessary. Note that * there may be more extents which overlap the given range after the returned * extent_map. * * If @page is not NULL and the extent is inline, this also reads the extent * data directly into the page and marks the extent up to date in the io_tree. * * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; struct extent_buffer *leaf; struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); else if (em->block_start == EXTENT_MAP_INLINE && page) free_extent_map(em); else goto out; } em = alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; em->orig_start = EXTENT_MAP_HOLE; em->len = (u64)-1; em->block_len = (u64)-1; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } /* Chances are we'll be called again, so go ahead and do readahead */ path->reada = READA_FORWARD; /* * The same explanation in load_free_space_cache applies here as well, * we only read when we're loading the free space cache, and at that * point the commit_root has everything we need. */ if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; ret = 0; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid || found_key.type != BTRFS_EXTENT_DATA_KEY) { /* * If we backup past the first extent we want to move forward * and see if there is an extent in front of us, otherwise we'll * say there is a hole for our whole search range which can * cause problems. */ extent_end = start; goto next; } extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; extent_end = btrfs_file_extent_end(path); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); goto out; } trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, path->slots[0], extent_start); } next: if (start >= extent_end) { path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; else if (ret > 0) goto not_found; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid || found_key.type != BTRFS_EXTENT_DATA_KEY) goto not_found; if (start + len <= found_key.offset) goto not_found; if (start > found_key.offset) goto next; /* New extent overlaps with existing one */ em->start = start; em->orig_start = start; em->len = found_key.offset - start; em->block_start = EXTENT_MAP_HOLE; goto insert; } btrfs_extent_item_to_extent_map(inode, path, item, !page, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { unsigned long ptr; char *map; size_t size; size_t extent_offset; size_t copy_size; if (!page) goto out; size = btrfs_file_extent_ram_bytes(leaf, item); extent_offset = page_offset(page) + pg_offset - extent_start; copy_size = min_t(u64, PAGE_SIZE - pg_offset, size - extent_offset); em->start = extent_start + extent_offset; em->len = ALIGN(copy_size, fs_info->sectorsize); em->orig_block_len = em->len; em->orig_start = em->start; ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (!PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); if (ret) goto out; } else { map = kmap_local_page(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); if (pg_offset + copy_size < PAGE_SIZE) { memset(map + pg_offset + copy_size, 0, PAGE_SIZE - pg_offset - copy_size); } kunmap_local(map); } flush_dcache_page(page); } set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } not_found: em->start = start; em->orig_start = start; em->len = len; em->block_start = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); ret = -EIO; goto out; } write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); if (ret) { free_extent_map(em); return ERR_PTR(ret); } return em; } struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len) { struct extent_map *em; struct extent_map *hole_em = NULL; u64 delalloc_start = start; u64 end; u64 delalloc_len; u64 delalloc_end; int err = 0; em = btrfs_get_extent(inode, NULL, 0, start, len); if (IS_ERR(em)) return em; /* * If our em maps to: * - a hole or * - a pre-alloc extent, * there might actually be delalloc bytes behind it. */ if (em->block_start != EXTENT_MAP_HOLE && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; /* check to see if we've wrapped (len == -1 or similar) */ end = start + len; if (end < start) end = (u64)-1; else end -= 1; em = NULL; /* ok, we didn't find anything, lets look for delalloc */ delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start, end, len, EXTENT_DELALLOC, 1); delalloc_end = delalloc_start + delalloc_len; if (delalloc_end < delalloc_start) delalloc_end = (u64)-1; /* * We didn't find anything useful, return the original results from * get_extent() */ if (delalloc_start > end || delalloc_end <= start) { em = hole_em; hole_em = NULL; goto out; } /* * Adjust the delalloc_start to make sure it doesn't go backwards from * the start they passed in */ delalloc_start = max(start, delalloc_start); delalloc_len = delalloc_end - delalloc_start; if (delalloc_len > 0) { u64 hole_start; u64 hole_len; const u64 hole_end = extent_map_end(hole_em); em = alloc_extent_map(); if (!em) { err = -ENOMEM; goto out; } ASSERT(hole_em); /* * When btrfs_get_extent can't find anything it returns one * huge hole * * Make sure what it found really fits our range, and adjust to * make sure it is based on the start from the caller */ if (hole_end <= start || hole_em->start > end) { free_extent_map(hole_em); hole_em = NULL; } else { hole_start = max(hole_em->start, start); hole_len = hole_end - hole_start; } if (hole_em && delalloc_start > hole_start) { /* * Our hole starts before our delalloc, so we have to * return just the parts of the hole that go until the * delalloc starts */ em->len = min(hole_len, delalloc_start - hole_start); em->start = hole_start; em->orig_start = hole_start; /* * Don't adjust block start at all, it is fixed at * EXTENT_MAP_HOLE */ em->block_start = hole_em->block_start; em->block_len = hole_len; if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { /* * Hole is out of passed range or it starts after * delalloc range */ em->start = delalloc_start; em->len = delalloc_len; em->orig_start = delalloc_start; em->block_start = EXTENT_MAP_DELALLOC; em->block_len = delalloc_len; } } else { return hole_em; } out: free_extent_map(hole_em); if (err) { free_extent_map(em); return ERR_PTR(err); } return em; } static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const u64 start, const u64 len, const u64 orig_start, const u64 block_start, const u64 block_len, const u64 orig_block_len, const u64 ram_bytes, const int type) { struct extent_map *em = NULL; int ret; if (type != BTRFS_ORDERED_NOCOW) { em = create_io_em(inode, start, len, orig_start, block_start, block_len, orig_block_len, ram_bytes, BTRFS_COMPRESS_NONE, /* compress_type */ type); if (IS_ERR(em)) goto out; } ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, block_len, type); if (ret) { if (em) { free_extent_map(em); btrfs_drop_extent_cache(inode, start, start + len - 1, 0); } em = ERR_PTR(ret); } out: return em; } static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, u64 start, u64 len) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); em = btrfs_create_dio_extent(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); return em; } static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; bool readonly = false; block_group = btrfs_lookup_block_group(fs_info, bytenr); if (!block_group || block_group->ro) readonly = true; if (block_group) btrfs_put_block_group(block_group); return readonly; } /* * Check if we can do nocow write into the range [@offset, @offset + @len) * * @offset: File offset * @len: The length to write, will be updated to the nocow writeable * range * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent * @strict: if true, omit optimizations that might force us into unnecessary * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write * <0 if error happened * * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_path *path; int ret; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 disk_bytenr; u64 backref_offset; u64 extent_end; u64 num_bytes; int slot; int found_type; bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), offset, 0); if (ret < 0) goto out; slot = path->slots[0]; if (ret == 1) { if (slot == 0) { /* can't find the item, must cow */ ret = 0; goto out; } slot--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ goto out; } if (key.offset > offset) { /* Wrong offset, must cow */ goto out; } fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); if (found_type != BTRFS_FILE_EXTENT_REG && found_type != BTRFS_FILE_EXTENT_PREALLOC) { /* not a regular extent, must cow */ goto out; } if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) goto out; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (extent_end <= offset) goto out; disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); if (disk_bytenr == 0) goto out; if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out; /* * Do the same check as in btrfs_cross_ref_exist but without the * unnecessary search. */ if (!strict && (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item))) goto out; backref_offset = btrfs_file_extent_offset(leaf, fi); if (orig_start) { *orig_start = key.offset - backref_offset; *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } if (btrfs_extent_readonly(fs_info, disk_bytenr)) goto out; num_bytes = min(offset + *len, extent_end) - offset; if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; range_end = round_up(offset + num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit(io_tree, offset, range_end, EXTENT_DELALLOC, 0, NULL); if (ret) { ret = -EAGAIN; goto out; } } btrfs_release_path(path); /* * look for other files referencing this extent, if we * find any we must cow */ ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), key.offset - backref_offset, disk_bytenr, strict); if (ret) { ret = 0; goto out; } /* * adjust disk_bytenr and num_bytes to cover just the bytes * in this extent we are about to write. If there * are any csums in that range we have to cow in order * to keep the csums correct */ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) goto out; /* * all of the above have passed, it is safe to overwrite this extent * without cow */ *len = num_bytes; ret = 1; out: btrfs_free_path(path); return ret; } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, lockend - lockstart + 1); /* * We need to make sure there are no buffered pages in this * range either, we could have raced between the invalidate in * generic_file_direct_write and locking the extent. The * invalidate needs to happen so that reads after a write do not * get stale data. */ if (!ordered && (!writing || !filemap_range_has_page(inode->i_mapping, lockstart, lockend))) break; unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); if (ordered) { /* * If we are doing a DIO read and the ordered extent we * found is for a buffered write, we can not wait for it * to complete and retry, because if we do so we can * deadlock with concurrent buffered writes on page * locks. This happens only if our DIO read covers more * than one extent map, if at this point has already * created an ordered extent for a previous extent map * and locked its range in the inode's io tree, and a * concurrent write against that previous extent map's * range and this range started (we unlock the ranges * in the io tree only when the bios complete and * buffered writes always lock pages before attempting * to lock range in the io tree). */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* * We could trigger writeback for this range (and wait * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ ret = -ENOTBLK; } if (ret) break; cond_resched(); } return ret; } /* The callers of this must take lock_extent() */ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type) { struct extent_map_tree *em_tree; struct extent_map *em; int ret; ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); em_tree = &inode->extent_tree; em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); em->start = start; em->orig_start = orig_start; em->len = len; em->block_len = block_len; em->block_start = block_start; em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) { set_bit(EXTENT_FLAG_FILLING, &em->flags); } else if (type == BTRFS_ORDERED_COMPRESSED) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; } do { btrfs_drop_extent_cache(inode, em->start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); /* * The caller has taken lock_extent(), who could race with us * to add em? */ } while (ret == -EEXIST); if (ret) { free_extent_map(em); return ERR_PTR(ret); } /* em got 2 refs now, callers needs to do free_extent_map once. */ return em; } static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; bool can_nocow = false; bool space_reserved = false; u64 prev_len; int ret = 0; /* * We don't allocate a new extent in the following cases * * 1) The inode is marked as NODATACOW. In this case we'll just use the * existing extent. * 2) The extent is marked as PREALLOC. We're good to go here and can * just use the extent. * */ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; else type = BTRFS_ORDERED_NOCOW; len = min(len, em->len - (start - em->start)); block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes, false) == 1 && btrfs_inc_nocow_writers(fs_info, block_start)) can_nocow = true; } prev_len = len; if (can_nocow) { struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; btrfs_dec_nocow_writers(fs_info, block_start); goto out; } space_reserved = true; em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); btrfs_dec_nocow_writers(fs_info, block_start); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); *map = em = em2; } if (IS_ERR(em2)) { ret = PTR_ERR(em2); goto out; } } else { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; /* We have to COW, so need to reserve metadata and data space. */ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &dio_data->data_reserved, start, len); if (ret < 0) goto out; space_reserved = true; em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } *map = em; len = min(len, em->len - (start - em->start)); if (len < prev_len) btrfs_delalloc_release_space(BTRFS_I(inode), dio_data->data_reserved, start + len, prev_len - len, true); } /* * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); out: if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len); if (can_nocow) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } else { btrfs_delalloc_release_space(BTRFS_I(inode), dio_data->data_reserved, start, len, true); extent_changeset_free(dio_data->data_reserved); dio_data->data_reserved = NULL; } } return ret; } static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; u64 lockstart, lockend; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; u64 len = length; bool unlock_extents = false; if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; /* * The generic stuff only does filemap_write_and_wait_range, which * isn't enough if we've written compressed pages to this area, so we * need to flush the dirty pages again to make absolutely sure that any * outstanding dirty pages are on disk. */ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) { ret = filemap_fdatawrite_range(inode->i_mapping, start, start + length - 1); if (ret) return ret; } dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); if (!dio_data) return -ENOMEM; iomap->private = dio_data; /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; } /* * Ok for INLINE and COMPRESSED extents we need to fallback on buffered * io. INLINE is special, and we could probably kludge it in here, but * it's still buffered so for safety lets just fall back to the generic * buffered path. * * For COMPRESSED we _have_ to read the entire extent in so we can * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); /* * If we are in a NOWAIT context, return -EAGAIN in order to * fallback to buffered IO. This is not only because we can * block with buffered IO (no support for NOWAIT semantics at * the moment) but also to avoid returning short reads to user * space - this happens if we were able to read some data from * previous non-compressed extents and then when we fallback to * buffered IO, at btrfs_file_read_iter() by calling * filemap_read(), we fail to fault in pages for the read buffer, * in which case filemap_read() returns a short read (the number * of bytes previously read is > 0, so it does not return -EFAULT). */ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; goto unlock_err; } len = min(len, em->len - (start - em->start)); /* * If we have a NOWAIT request and the range contains multiple extents * (or a mix of extents and holes), then we return -EAGAIN to make the * caller fallback to a context where it can do a blocking (without * NOWAIT) request. This way we avoid doing partial IO and returning * success to the caller, which is not optimal for writes and for reads * it can result in unexpected behaviour for an application. * * When doing a read, because we use IOMAP_DIO_PARTIAL when calling * iomap_dio_rw(), we can end up returning less data then what the caller * asked for, resulting in an unexpected, and incorrect, short read. * That is, the caller asked to read N bytes and we return less than that, * which is wrong unless we are crossing EOF. This happens if we get a * page fault error when trying to fault in pages for the buffer that is * associated to the struct iov_iter passed to iomap_dio_rw(), and we * have previously submitted bios for other extents in the range, in * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of * those bios have completed by the time we get the page fault error, * which we return back to our caller - we should only return EIOCBQUEUED * after we have submitted bios for all the extents in the range. */ if ((flags & IOMAP_NOWAIT) && len < length) { free_extent_map(em); ret = -EAGAIN; goto unlock_err; } if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, start, len); if (ret < 0) goto unlock_err; unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); } else { /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ lockstart = start + len; if (lockstart < lockend) unlock_extents = true; } if (unlock_extents) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); else free_extent_state(cached_state); /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does * that, since we have locked only the parts we are performing I/O in. */ if ((em->block_start == EXTENT_MAP_HOLE) || (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { iomap->addr = em->block_start + (start - em->start); iomap->type = IOMAP_MAPPED; } iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) iomap->flags |= IOMAP_F_ZONE_APPEND; free_extent_map(em); return 0; unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: kfree(dio_data); return ret; } static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { int ret = 0; struct btrfs_dio_data *dio_data = iomap->private; size_t submitted = dio_data->submitted; const bool write = !!(flags & IOMAP_WRITE); if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); goto out; } if (submitted < length) { pos += submitted; length -= submitted; if (write) __endio_write_update_ordered(BTRFS_I(inode), pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); ret = -ENOTBLK; } if (write) extent_changeset_free(dio_data->data_reserved); out: kfree(dio_data); iomap->private = NULL; return ret; } static void btrfs_dio_private_put(struct btrfs_dio_private *dip) { /* * This implies a barrier so that stores to dio_bio->bi_status before * this and loads of dio_bio->bi_status after this are fully ordered. */ if (!refcount_dec_and_test(&dip->refs)) return; if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), dip->logical_offset, dip->bytes, !dip->dio_bio->bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); } bio_endio(dip->dio_bio); kfree(dip); } static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_dio_private *dip = bio->bi_private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) return ret; refcount_inc(&dip->refs); ret = btrfs_map_bio(fs_info, bio, mirror_num); if (ret) refcount_dec(&dip->refs); return ret; } static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, struct btrfs_io_bio *io_bio, const bool uptodate) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; u64 start = io_bio->logical; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { unsigned int i, nr_sectors, pgoff; nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, io_bio, bio_offset, bvec.bv_page, pgoff, start))) { clean_io_failure(fs_info, failure_tree, io_tree, start, bvec.bv_page, btrfs_ino(BTRFS_I(inode)), pgoff); } else { int ret; ASSERT((start - io_bio->logical) < UINT_MAX); ret = btrfs_repair_one_sector(inode, &io_bio->bio, start - io_bio->logical, bvec.bv_page, pgoff, start, io_bio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); } start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); bio_offset += sectorsize; pgoff += sectorsize; } } return err; } static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, finish_ordered_fn, uptodate); } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1); } static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); if (bio_op(bio) == REQ_OP_READ) { err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), !err); } if (err) dip->dio_bio->bi_status = err; btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); } static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; /* Check btrfs_submit_bio_hook() for rules about async submit. */ if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); if (!write) { ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) goto err; } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; if (write && async_submit) { ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, btrfs_submit_bio_start_direct_io); goto err; } else if (write) { /* * If we aren't doing async submit, calculate the csum of the * bio now. */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); if (ret) goto err; } else { u64 csum_offset; csum_offset = file_offset - dip->logical_offset; csum_offset >>= fs_info->sectorsize_bits; csum_offset *= fs_info->csum_size; btrfs_io_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); err: return ret; } /* * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked * or ordered extents whether or not we submit any bios. */ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); size_t dip_size; struct btrfs_dio_private *dip; dip_size = sizeof(*dip); if (!write && csum) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); size_t nblocks; nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; dip_size += fs_info->csum_size * nblocks; } dip = kzalloc(dip_size, GFP_NOFS); if (!dip) return NULL; dip->inode = inode; dip->logical_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); return dip; } static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); struct btrfs_dio_private *dip; struct bio *bio; u64 start_sector; int async_submit = 0; u64 submit_len; u64 clone_offset = 0; u64 clone_len; u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; struct btrfs_dio_data *dio_data = iter->iomap.private; struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { if (!write) { unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; bio_endio(dio_bio); return BLK_QC_T_NONE; } if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. * * If we have csums disabled this will do nothing. */ status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); if (status != BLK_STS_OK) goto out_err; } start_sector = dio_bio->bi_iter.bi_sector; submit_len = dio_bio->bi_iter.bi_size; do { logical = start_sector << 9; em = btrfs_get_chunk_map(fs_info, logical, submit_len); if (IS_ERR(em)) { status = errno_to_blk_status(PTR_ERR(em)); em = NULL; goto out_err_em; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), logical, &geom); if (ret) { status = errno_to_blk_status(ret); goto out_err_em; } clone_len = min(submit_len, geom.len); ASSERT(clone_len <= UINT_MAX); /* * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. */ bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); if (status) { bio_put(bio); goto out_err; } } ASSERT(submit_len >= clone_len); submit_len -= clone_len; /* * Increase the count before we submit the bio so we know * the end IO handler won't happen before we increase the * count. Otherwise, the dip might get freed before we're * done setting it up. * * We transfer the initial reference to the last bio, so we * don't need to increment the reference count for the last one. */ if (submit_len > 0) { refcount_inc(&dip->refs); /* * If we are submitting more than one bio, submit them * all asynchronously. The exception is RAID 5 or 6, as * asynchronous checksums make it difficult to collect * full stripe writes. */ if (!raid56) async_submit = 1; } status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); if (submit_len > 0) refcount_dec(&dip->refs); goto out_err_em; } dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; free_extent_map(em); } while (submit_len > 0); return BLK_QC_T_NONE; out_err_em: free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); return BLK_QC_T_NONE; } const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, }; static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret; ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); if (bio_ctrl.bio) ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; int ret; if (current->flags & PF_MEMALLOC) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } /* * If we are under memory pressure we will call this directly from the * VM, we need to make sure we have the inode referenced for the ordered * extent. If not just return like we didn't do anything. */ if (!igrab(inode)) { redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } ret = extent_write_full_page(page, wbc); btrfs_add_delayed_iput(inode); return ret; } static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { return extent_writepages(mapping, wbc); } static void btrfs_readahead(struct readahead_control *rac) { extent_readahead(rac); } /* * For releasepage() and invalidatepage() we have a race window where * end_page_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ static void wait_subpage_spinlock(struct page *page) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct btrfs_subpage *subpage; if (fs_info->sectorsize == PAGE_SIZE) return; ASSERT(PagePrivate(page) && page->private); subpage = (struct btrfs_subpage *)page->private; /* * This may look insane as we just acquire the spinlock and release it, * without doing anything. But we just want to make sure no one is * still holding the subpage spinlock. * And since the page is not dirty nor writeback, and we have page * locked, the only possible way to hold a spinlock is from the endio * function to clear page writeback. * * Here we just acquire the spinlock so that all existing callers * should exit and we're safe to release/invalidate the page. */ spin_lock_irq(&subpage->lock); spin_unlock_irq(&subpage->lock); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) { wait_subpage_spinlock(page); clear_page_extent_mapped(page); } return ret; } static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; return __btrfs_releasepage(page, gfp_flags); } #ifdef CONFIG_MIGRATION static int btrfs_migratepage(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { int ret; ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; if (page_has_private(page)) attach_page_private(newpage, detach_page_private(page)); if (PageOrdered(page)) { ClearPageOrdered(page); SetPageOrdered(newpage); } if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } #endif static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * We have page locked so no new ordered extent can be created on this * page, nor bio can be submitted for this page. * * But already submitted bio can still be finished on this page. * Furthermore, endio function won't skip page which has Ordered * (Private2) already cleared, so it's possible for endio and * invalidatepage to do the same ordered extent accounting twice * on one page. * * So here we wait for any submitted bios to finish, so that we won't * do double ordered extent accounting on the same page. */ wait_on_page_writeback(page); wait_subpage_spinlock(page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. * If the range doesn't cover the full page, we don't need to and * shouldn't clear page extent mapped, as page->private can still * record subpage dirty bits for other part of the range. * * For cases that can invalidate the full even the range doesn't * cover the full page, like invalidating the last page, we're * still safe to wait for ordered extent to finish. */ if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return; } if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { struct btrfs_ordered_extent *ordered; bool delete_states; u64 range_end; u32 range_len; ordered = btrfs_lookup_first_ordered_range(inode, cur, page_end + 1 - cur); if (!ordered) { range_end = page_end; /* * No ordered extent covering this range, we are safe * to delete all extent states in the range. */ delete_states = true; goto next; } if (ordered->file_offset > cur) { /* * There is a range between [cur, oe->file_offset) not * covered by any ordered extent. * We are safe to delete all extent states, and handle * the ordered extent in the next iteration. */ range_end = ordered->file_offset - 1; delete_states = true; goto next; } range_end = min(ordered->file_offset + ordered->num_bytes - 1, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. */ delete_states = false; goto next; } btrfs_page_clear_ordered(fs_info, page, cur, range_len); /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. * * This will also unlock the range for incoming * btrfs_finish_ordered_io(). */ if (!inode_evicting) clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); spin_unlock_irq(&inode->ordered_tree.lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); /* * The ordered extent has finished, now we're again * safe to delete all extent states of the range. */ delete_states = true; } else { /* * btrfs_finish_ordered_io() will get executed by endio * of other pages, thus we can't delete extent states * anymore */ delete_states = false; } next: if (ordered) btrfs_put_ordered_extent(ordered); /* * Qgroup reserved space handler * Sector(s) here will be either: * * 1) Already written to disk or bio already finished * Then its QGROUP_RESERVED bit in io_tree is already cleared. * Qgroup will be handled by its qgroup_record then. * btrfs_qgroup_free_data() call will do nothing here. * * 2) Not written to disk yet * Then btrfs_qgroup_free_data() call will clear the * QGROUP_RESERVED bit of its io_tree, and free the qgroup * reserved data space. * Since the IO will never happen for this page. */ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, delete_states, &cached_state); } cur = range_end + 1; } /* * We have iterated through all ordered extents of the page, the page * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ ASSERT(!PageOrdered(page)); if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); clear_page_extent_mapped(page); } /* * btrfs_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate_setsize() writes the inode size before removing pages, once we have * the page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; vm_fault_t ret; int ret2; int reserved = 0; u64 reserved_space; u64 page_start; u64 page_end; u64 end; reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; end = page_end; /* * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering * dirty page write out, then the btrfs_writepage() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, page_start, reserved_space); if (!ret2) { ret2 = file_update_time(vmf->vma->vm_file); reserved = 1; } if (ret2) { ret = vmf_error(ret2); if (reserved) goto out; goto out_noreserve; } ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: down_read(&BTRFS_I(inode)->i_mmap_lock); lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || (page_start >= size)) { /* page got truncated out from underneath us */ goto out_unlock; } wait_on_page_writeback(page); lock_extent_bits(io_tree, page_start, page_end, &cached_state); ret2 = set_page_extent_mapped(page); if (ret2 < 0) { ret = vmf_error(ret2); unlock_extent_cached(io_tree, page_start, page_end, &cached_state); goto out_unlock; } /* * we can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish */ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } if (page->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, PAGE_SIZE - reserved_space, true); } } /* * page_mkwrite gets called when the page is firstly dirtied after it's * faulted in, but write(2) could also dirty a page and set delalloc * bits, thus in this case for space account reason, we still need to * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) zero_start = offset_in_page(size); else zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) { memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); } ClearPageChecked(page); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); unlock_extent_cached(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; out_unlock: unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return ret; } static int btrfs_truncate(struct inode *inode, bool skip_writeback) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); if (ret) return ret; } /* * Yes ladies and gentlemen, this is indeed ugly. We have a couple of * things going on here: * * 1) We need to reserve space to update our inode. * * 2) We need to have something to cache all the space that is going to * be free'd up by the truncate operation, but also have some slack * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * * And we need these to be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode. We also * need to be able to stop the transaction and start a new one, which * means we need to be able to update the inode several times, and we * have no idea of knowing how many times that will be, so we can't just * reserve 1 item for the entirety of the operation, so that has to be * done separately as well. * * So that leaves us with * * 1) rsv - for the truncate reservation, which we will steal from the * transaction reservation. * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) return -ENOMEM; rsv->size = min_size; rsv->failfast = 1; /* * 1 for the truncate slack space * 1 for updating the inode. */ trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); BUG_ON(ret); trans->block_rsv = rsv; while (1) { ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, BTRFS_EXTENT_DATA_KEY, &extents_found); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) break; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; break; } btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } /* * We can't call btrfs_truncate_block inside a trans handle as we could * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know * we've truncated everything except the last little bit, and can do * btrfs_truncate_block and then update the disk_i_size. */ if (ret == NEED_TRUNCATE_BLOCK) { btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret2 && !ret) ret = ret2; ret2 = btrfs_end_transaction(trans); if (ret2 && !ret) ret = ret2; btrfs_btree_balance_dirty(fs_info); } out: btrfs_free_block_rsv(fs_info, rsv); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to * first truncate that entire inode. So set this flag so we write out * all of the extents in the inode to the sync log so we're completely * safe. * * If no extents were dropped or trimmed we don't need to force the next * fsync to truncate all the inode's items from the log and re-log them * all. This means the truncate operation did not change the file size, * or changed it to a smaller size but there was only an implicit hole * between the old i_size and the new i_size, and there were no prealloc * extents beyond i_size to drop. */ if (extents_found > 0) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; } /* * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, struct user_namespace *mnt_userns) { struct inode *inode; int err; u64 index = 0; u64 ino; err = btrfs_get_free_objectid(new_root, &ino); if (err < 0) return err; inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; set_nlink(inode, 1); btrfs_i_size_write(BTRFS_I(inode), 0); unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) btrfs_err(new_root->fs_info, "error inheriting subvolume %llu properties: %d", new_root->root_key.objectid, err); err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); iput(inode); return err; } struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->root = NULL; ei->generation = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->ro_flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_reflink_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, BTRFS_BLOCK_RSV_DELALLOC); ei->runtime_flags = 0; ei->prop_compress = BTRFS_COMPRESS_NONE; ei->defrag_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; ei->i_otime.tv_sec = 0; ei->i_otime.tv_nsec = 0; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); extent_io_tree_init(fs_info, &ei->io_failure_tree, IO_TREE_INODE_IO_FAILURE, inode); extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT, inode); ei->io_tree.track_uptodate = true; ei->io_failure_tree.track_uptodate = true; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); init_rwsem(&ei->i_mmap_lock); return inode; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif void btrfs_free_inode(struct inode *inode) { kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } void btrfs_destroy_inode(struct inode *vfs_inode) { struct btrfs_ordered_extent *ordered; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); WARN_ON(vfs_inode->i_data.nrpages); WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also * created the same inode and we need to destroy the one we already * created. */ if (!root) return; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else { btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; if (root == NULL) return 1; /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0) return 1; else return generic_drop_inode(inode); } static void init_once(void *foo) { struct btrfs_inode *ei = (struct btrfs_inode *) foo; inode_init_once(&ei->vfs_inode); } void __cold btrfs_destroy_cachep(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) goto fail; btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) goto fail; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_bitmap_cachep) goto fail; return 0; fail: btrfs_destroy_cachep(); return -ENOMEM; } static int btrfs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; u64 inode_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; if (bi_flags & BTRFS_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) stat->attributes |= STATX_ATTR_COMPRESSED; if (bi_flags & BTRFS_INODE_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; if (bi_ro_flags & BTRFS_INODE_RO_VERITY) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(mnt_userns, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_bytes, blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; return 0; } static int btrfs_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; bool need_abort = false; /* * For non-subvolumes allow exchange only within one subvolume, in the * same inode namespace. Two subvolumes (represented as directory) can * be exchanged as they're a logical link and have a fixed inode number. */ if (root != dest && (old_ino != BTRFS_FIRST_FREE_OBJECTID || new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it * would require 5 item modifications, so we'll assume their normal * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items * should cover the worst case number of items we'll modify. */ trans = btrfs_start_transaction(root, 12); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail; } /* * We need to find a free sequence number both in the source and * in the destination directory for the exchange. */ ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); if (ret) goto out_fail; ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); if (ret) goto out_fail; BTRFS_I(old_inode)->dir_index = 0ULL; BTRFS_I(new_inode)->dir_index = 0ULL; /* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) goto out_fail; need_abort = true; } /* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, root, old_dentry->d_name.name, old_dentry->d_name.len, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { if (need_abort) btrfs_abort_transaction(trans, ret); goto out_fail; } } /* Update inode version and ctime/mtime. */ inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; new_inode->i_ctime = ctime; if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), 1); btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), BTRFS_I(new_inode), 1); } /* * Now pin the logs of the roots. We do it to ensure that no other task * can sync the logs while we are in progress with the rename, because * that could result in an inconsistency in case any of the inodes that * are part of this rename operation were logged before. * * We pin the logs even if at this precise moment none of the inodes was * logged before. This is because right after we checked for that, some * other task fsyncing some other inode not involved with this rename * operation could log that one of our inodes exists. * * We don't need to pin the logs before the above calls to * btrfs_insert_inode_ref(), since those don't ever need to change a log. */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { btrfs_pin_log_trans(root); root_log_pinned = true; } if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { btrfs_pin_log_trans(dest); dest_log_pinned = true; } /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, new_dentry->d_name.len); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_dentry->d_name.name, new_dentry->d_name.len, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_dentry->d_name.name, old_dentry->d_name.len, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = old_idx; if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } out_fail: /* * If we have pinned a log and an error happened, we unpin tasks * trying to sync the log and force them to fallback to a transaction * commit if the log currently contains any of the inodes involved in * this rename operation (to ensure we do not persist a log with an * inconsistent state for any of these inodes or leading to any * inconsistencies when replayed). If the transaction was aborted, the * abortion reason is propagated to userspace when attempting to commit * the transaction. If the log does not contain any of these inodes, we * allow the tasks to sync it. */ if (ret && (root_log_pinned || dest_log_pinned)) { if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) btrfs_set_log_full_commit(trans); if (root_log_pinned) { btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { btrfs_end_log_trans(dest); dest_log_pinned = false; } } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); return ret; } static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry) { int ret; struct inode *inode; u64 objectid; u64 index; ret = btrfs_get_free_objectid(root, &objectid); if (ret) return ret; inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, S_IFCHR | WHITEOUT_MODE, &index); if (IS_ERR(inode)) { ret = PTR_ERR(inode); return ret; } inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); ret = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (ret) goto out; ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 0, index); if (ret) goto out; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: unlock_new_inode(inode); if (ret) inode_dec_link_count(inode); iput(inode); return ret; } static int btrfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) return -ENOTEMPTY; if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; /* check for collisions, even if the name isn't there */ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, new_dentry->d_name.name, new_dentry->d_name.len); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { return ret; } } else { /* maybe -EOVERFLOW */ return ret; } } ret = 0; /* * we're using rename to replace one file with another. Start IO on it * now so we don't add too much work to the end of the transaction */ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); /* close the racy window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. * If our rename has the whiteout flag, we need more 5 units for the * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item * when selinux is enabled). */ trans_num_items = 11; if (flags & RENAME_WHITEOUT) trans_num_items += 5; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail; } ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) goto out_fail; BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) goto out_fail; } inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = old_inode->i_ctime = current_time(old_dir); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* * Now pin the log. We do it to ensure that no other task can * sync the log while we are in progress with the rename, as * that could result in an inconsistency in case any of the * inodes that are part of this rename operation were logged * before. * * We pin the log even if at this precise moment none of the * inodes was logged before. This is because right after we * checked for that, some other task fsyncing some other inode * not involved with this rename operation could log that one of * our inodes exists. * * We don't need to pin the logs before the above call to * btrfs_insert_inode_ref(), since that does not need to change * a log. */ btrfs_pin_log_trans(root); log_pinned = true; ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (new_inode) { inode_inc_iversion(new_inode); new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), new_dentry->d_name.name, new_dentry->d_name.len); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } if (flags & RENAME_WHITEOUT) { ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, old_dir, old_dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } } out_fail: /* * If we have pinned the log and an error happened, we unpin tasks * trying to sync the log and force them to fallback to a transaction * commit if the log currently contains any of the inodes involved in * this rename operation (to ensure we do not persist a log with an * inconsistent state for any of these inodes or leading to any * inconsistencies when replayed). If the transaction was aborted, the * abortion reason is propagated to userspace when attempting to commit * the transaction. If the log does not contain any of these inodes, we * allow the tasks to sync it. */ if (ret && log_pinned) { if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || (new_inode && btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) btrfs_set_log_full_commit(trans); btrfs_end_log_trans(root); log_pinned = false; } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); return ret; } static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) return btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags); } struct btrfs_delalloc_work { struct inode *inode; struct completion completion; struct list_head list; struct btrfs_work work; }; static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; struct inode *inode; delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; filemap_flush(inode->i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); iput(inode); complete(&delalloc_work->completion); } static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) { struct btrfs_delalloc_work *work; work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; struct list_head works; struct list_head splice; int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); list_move_tail(&binode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) continue; inode = igrab(&binode->vfs_inode); if (!inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &binode->runtime_flags); if (full_flush) { work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); ret = -ENOMEM; goto out; } list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); wait_for_completion(&work->completion); kfree(work); } if (!list_empty(&splice)) { spin_lock(&root->delalloc_lock); list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); } mutex_unlock(&root->delalloc_mutex); return ret; } int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; struct btrfs_fs_info *fs_info = root->fs_info; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = nr, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; struct btrfs_root *root; struct list_head splice; int ret; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; INIT_LIST_HEAD(&splice); mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush. */ if (nr == LONG_MAX) wbc.nr_to_write = LONG_MAX; root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); btrfs_put_root(root); if (ret < 0 || wbc.nr_to_write <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); ret = 0; out: if (!list_empty(&splice)) { spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } mutex_unlock(&fs_info->delalloc_root_mutex); return ret; } static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct btrfs_key key; struct inode *inode = NULL; int err; u64 objectid; u64 index = 0; int name_len; int datasize; unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; name_len = strlen(symname); if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; /* * 2 items for inode item and ref * 2 items for dir items * 1 item for updating parent inode item * 1 item for the inline extent item * 1 item for xattr if selinux is on */ trans = btrfs_start_transaction(root, 7); if (IS_ERR(trans)) return PTR_ERR(trans); err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, S_IFLNK | S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto out_unlock; } /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_unlock; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out_unlock; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { btrfs_free_path(path); goto out_unlock; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_encryption(leaf, ei, 0); btrfs_set_file_extent_compression(leaf, ei, 0); btrfs_set_file_extent_other_encoding(leaf, ei, 0); btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode_set_bytes(inode, name_len); btrfs_i_size_write(BTRFS_I(inode), name_len); err = btrfs_update_inode(trans, root, BTRFS_I(inode)); /* * Last step, add directory indexes for our symlink inode. This is the * last step to avoid extra cleanup of these indexes if an error happens * elsewhere above. */ if (!err) err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), 0, index); if (err) goto out_unlock; d_instantiate_new(dentry, inode); out_unlock: btrfs_end_transaction(trans); if (err && inode) { inode_dec_link_count(inode); discard_new_inode(inode); } btrfs_btree_balance_dirty(fs_info); return err; } static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_trans_handle *trans_in, struct btrfs_inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; struct btrfs_replace_extent_info extent_info; struct btrfs_trans_handle *trans = trans_in; struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; int qgroup_released; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); if (qgroup_released < 0) return ERR_PTR(qgroup_released); if (trans) { ret = insert_reserved_file_extent(trans, inode, file_offset, &stack_fi, true, qgroup_released); if (ret) goto free_qgroup; return trans; } extent_info.disk_offset = start; extent_info.disk_len = len; extent_info.data_offset = 0; extent_info.data_len = len; extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto free_qgroup; } ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); if (ret) goto free_qgroup; return trans; free_qgroup: /* * We have released qgroup data range at the beginning of the function, * and normally qgroup_released bytes will be freed when committing * transaction. * But if we error out early, we have to free what we have released * or we leak qgroup data reservation. */ btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; u64 clear_offset = start; u64 i_size; u64 cur_bytes; u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; u64 end = start + num_bytes - 1; if (trans) own_trans = false; while (num_bytes > 0) { cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really * small allocations, so if the allocator is returning small * chunks lets make its job easier by only searching for those * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) break; /* * We've reserved this space, and thus converted it from * ->bytes_may_use to ->bytes_reserved. Any error that happens * from here on out we will only need to clear our reservation * for the remaining unreserved area, so advance our * clear_offset by our extent size. */ clear_offset += ins.offset; last_alloc = ins.offset; trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), &ins, cur_offset); /* * Now that we inserted the prealloc extent we can finally * decrement the number of reservations in the block group. * If we did it before, we could race with relocation and have * relocation miss the reserved extent, making it fail later. */ btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); break; } btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + ins.offset -1, 0); em = alloc_extent_map(); if (!em) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); goto next; } em->start = cur_offset; em->orig_start = cur_offset; em->len = ins.offset; em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; while (1) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) break; btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, 0); } free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); inode->i_ctime = current_time(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len) i_size = actual_len; else i_size = cur_offset; i_size_write(inode, i_size); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans); break; } if (own_trans) { btrfs_end_transaction(trans); trans = NULL; } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, end - clear_offset + 1); return ret; } int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint) { return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, min_size, actual_len, alloc_hint, NULL); } int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint) { return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, min_size, actual_len, alloc_hint, trans); } static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); } static int btrfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; umode_t mode = inode->i_mode; if (mask & MAY_WRITE && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { if (btrfs_root_readonly(root)) return -EROFS; if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } return generic_permission(mnt_userns, inode, mask); } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; u64 objectid; u64 index; int ret = 0; /* * 5 units required for adding orphan entry */ trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; goto out; } inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) goto out; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) goto out; ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto out; /* * We set number of links to 0 in btrfs_new_inode(), and here we set * it to 1 because d_tmpfile() will issue a warning if the count is 0, * through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); d_tmpfile(dentry, inode); unlock_new_inode(inode); mark_inode_dirty(inode); out: btrfs_end_transaction(trans); if (ret && inode) discard_new_inode(inode); btrfs_btree_balance_dirty(fs_info); return ret; } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ btrfs_page_set_writeback(fs_info, page, start, len); put_page(page); index++; } } #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a * negative errno on failure. */ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, bool is_block_group) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_swapfile_pin *sp, *entry; struct rb_node **p; struct rb_node *parent = NULL; sp = kmalloc(sizeof(*sp), GFP_NOFS); if (!sp) return -ENOMEM; sp->ptr = ptr; sp->inode = inode; sp->is_block_group = is_block_group; sp->bg_extent_count = 1; spin_lock(&fs_info->swapfile_pins_lock); p = &fs_info->swapfile_pins.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_swapfile_pin, node); if (sp->ptr < entry->ptr || (sp->ptr == entry->ptr && sp->inode < entry->inode)) { p = &(*p)->rb_left; } else if (sp->ptr > entry->ptr || (sp->ptr == entry->ptr && sp->inode > entry->inode)) { p = &(*p)->rb_right; } else { if (is_block_group) entry->bg_extent_count++; spin_unlock(&fs_info->swapfile_pins_lock); kfree(sp); return 1; } } rb_link_node(&sp->node, parent, p); rb_insert_color(&sp->node, &fs_info->swapfile_pins); spin_unlock(&fs_info->swapfile_pins_lock); return 0; } /* Free all of the entries pinned by this swapfile. */ static void btrfs_free_swapfile_pins(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_swapfile_pin *sp; struct rb_node *node, *next; spin_lock(&fs_info->swapfile_pins_lock); node = rb_first(&fs_info->swapfile_pins); while (node) { next = rb_next(node); sp = rb_entry(node, struct btrfs_swapfile_pin, node); if (sp->inode == inode) { rb_erase(&sp->node, &fs_info->swapfile_pins); if (sp->is_block_group) { btrfs_dec_block_group_swap_extents(sp->ptr, sp->bg_extent_count); btrfs_put_block_group(sp->ptr); } kfree(sp); } node = next; } spin_unlock(&fs_info->swapfile_pins_lock); } struct btrfs_swap_info { u64 start; u64 block_start; u64 block_len; u64 lowest_ppage; u64 highest_ppage; unsigned long nr_pages; int nr_extents; }; static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { unsigned long nr_pages; unsigned long max_pages; u64 first_ppage, first_ppage_reported, next_ppage; int ret; /* * Our swapfile may have had its size extended after the swap header was * written. In that case activating the swapfile should not go beyond * the max size set in the swap header. */ if (bsi->nr_pages >= sis->max) return 0; max_pages = sis->max - bsi->nr_pages; first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, PAGE_SIZE) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; nr_pages = min(nr_pages, max_pages); first_ppage_reported = first_ppage; if (bsi->start == 0) first_ppage_reported++; if (bsi->lowest_ppage > first_ppage_reported) bsi->lowest_ppage = first_ppage_reported; if (bsi->highest_ppage < (next_ppage - 1)) bsi->highest_ppage = next_ppage - 1; ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); if (ret < 0) return ret; bsi->nr_extents += ret; bsi->nr_pages += nr_pages; return 0; } static void btrfs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); btrfs_free_swapfile_pins(inode); atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); } static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; struct extent_map *em = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; int ret = 0; u64 isize; u64 start; /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care. */ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) return ret; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); return -EINVAL; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); return -EINVAL; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); return -EINVAL; } /* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't * run concurrently while we are mapping the swap extents, and * fs_info->swapfile_pins prevents them from running while the swap * file is active and moving the extents. Note that this also prevents * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. */ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; } /* * Prevent snapshot creation while we are activating the swap file. * We do not want to race with snapshot creation. If snapshot creation * already started before we bumped nr_swapfiles from 0 to 1 and * completes before the first write into the swap file after it is * activated, than that write would fallback to COW. */ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); return -EINVAL; } /* * Snapshots can create extents which require COW even if NODATACOW is * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. * * It is possible that subvolume is marked for deletion but still not * removed yet. To prevent this race, we check the root status before * activating the swapfile. */ spin_lock(&root->root_item_lock); if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", root->root_key.objectid); return -EPERM; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent_bits(io_tree, 0, isize - 1, &cached_state); start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; struct btrfs_block_group *bg; u64 len = isize - start; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } if (em->block_start == EXTENT_MAP_HOLE) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } if (em->block_start == EXTENT_MAP_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be * big enough to store more than the swap header, but in * case something changes in the future, let's catch it * here rather than later. */ btrfs_warn(fs_info, "swapfile must not be inline"); ret = -EINVAL; goto out; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } logical_block_start = em->block_start + (start - em->start); len = min(len, em->len - (start - em->start)); free_extent_map(em); em = NULL; ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); if (ret < 0) { goto out; } else if (ret) { ret = 0; } else { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; goto out; } em = btrfs_get_chunk_map(fs_info, logical_block_start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { btrfs_warn(fs_info, "swapfile must have single data profile"); ret = -EINVAL; goto out; } if (device == NULL) { device = em->map_lookup->stripes[0].dev; ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1) ret = 0; else if (ret) goto out; } else if (device != em->map_lookup->stripes[0].dev) { btrfs_warn(fs_info, "swapfile must be on one device"); ret = -EINVAL; goto out; } physical_block_start = (em->map_lookup->stripes[0].physical + (logical_block_start - em->start)); len = min(len, em->len - (logical_block_start - em->start)); free_extent_map(em); em = NULL; bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) { btrfs_warn(fs_info, "could not find block group containing swapfile"); ret = -EINVAL; goto out; } if (!btrfs_inc_block_group_swap_extents(bg)) { btrfs_warn(fs_info, "block group for swapfile at %llu is read-only%s", bg->start, atomic_read(&fs_info->scrubs_running) ? " (scrub running)" : ""); btrfs_put_block_group(bg); ret = -EINVAL; goto out; } ret = btrfs_add_swapfile_pin(inode, bg, true); if (ret) { btrfs_put_block_group(bg); if (ret == 1) ret = 0; else goto out; } if (bsi.block_len && bsi.block_start + bsi.block_len == physical_block_start) { bsi.block_len += len; } else { if (bsi.block_len) { ret = btrfs_add_swap_extent(sis, &bsi); if (ret) goto out; } bsi.start = start; bsi.block_start = physical_block_start; bsi.block_len = len; } start += len; cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); if (ret) return ret; if (device) sis->bdev = device->bdev; *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else static void btrfs_swap_deactivate(struct file *file) { } static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return -EOPNOTSUPP; } #endif /* * Update the number of bytes used in the VFS' inode. When we replace extents in * a range (clone, dedupe, fallocate's zero range), we must update the number of * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls * always get a correct value. */ void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes) { if (add_bytes == del_bytes) return; spin_lock(&inode->lock); if (del_bytes > 0) inode_sub_bytes(&inode->vfs_inode, del_bytes); if (add_bytes > 0) inode_add_bytes(&inode->vfs_inode, add_bytes); spin_unlock(&inode->lock); } static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, .create = btrfs_create, .unlink = btrfs_unlink, .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, .rename = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, .listxattr = btrfs_listxattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, .fileattr_get = btrfs_fileattr_get, .fileattr_set = btrfs_fileattr_set, }; static const struct file_operations btrfs_dir_file_operations = { .llseek = btrfs_dir_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, .open = btrfs_opendir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, }; /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume * these extents won't change over the life of the file and they * use the bmap result to do IO directly to the drive. * * the btrfs bmap call would return logical addresses that aren't * suitable for IO and they also will change frequently as COW * operations happen. So, swapfile + btrfs == corruption. * * For now we're avoiding this by dropping bmap. */ static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION .migratepage = btrfs_migratepage, #endif .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .listxattr = btrfs_listxattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .fileattr_get = btrfs_fileattr_get, .fileattr_set = btrfs_fileattr_set, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, .update_time = btrfs_update_time, }; const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, };
79 91 25 18 76 30 76 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 /* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM erofs #if !defined(_TRACE_EROFS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EROFS_H #include <linux/tracepoint.h> #include <linux/fs.h> struct erofs_map_blocks; #define show_dev(dev) MAJOR(dev), MINOR(dev) #define show_dev_nid(entry) show_dev(entry->dev), entry->nid #define show_file_type(type) \ __print_symbolic(type, \ { 0, "FILE" }, \ { 1, "DIR" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EROFS_GET_BLOCKS_RAW, "RAW" }) #define show_mflags(flags) __print_flags(flags, "", \ { EROFS_MAP_MAPPED, "M" }, \ { EROFS_MAP_META, "I" }, \ { EROFS_MAP_ZIPPED, "Z" }) TRACE_EVENT(erofs_lookup, TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __string(name, dentry->d_name.name ) __field(unsigned int, flags ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->nid = EROFS_I(dir)->nid; __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x", show_dev_nid(__entry), __get_str(name), __entry->flags) ); TRACE_EVENT(erofs_fill_inode, TP_PROTO(struct inode *inode, int isdir), TP_ARGS(inode, isdir), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(erofs_blk_t, blkaddr ) __field(unsigned int, ofs ) __field(int, isdir ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->blkaddr = erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid)); __entry->ofs = erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid)); __entry->isdir = isdir; ), TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u, isdir %d", show_dev_nid(__entry), __entry->blkaddr, __entry->ofs, __entry->isdir) ); TRACE_EVENT(erofs_readpage, TP_PROTO(struct page *page, bool raw), TP_ARGS(page, raw), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(int, dir ) __field(pgoff_t, index ) __field(int, uptodate) __field(bool, raw ) ), TP_fast_assign( __entry->dev = page->mapping->host->i_sb->s_dev; __entry->nid = EROFS_I(page->mapping->host)->nid; __entry->dir = S_ISDIR(page->mapping->host->i_mode); __entry->index = page->index; __entry->uptodate = PageUptodate(page); __entry->raw = raw; ), TP_printk("dev = (%d,%d), nid = %llu, %s, index = %lu, uptodate = %d " "raw = %d", show_dev_nid(__entry), show_file_type(__entry->dir), (unsigned long)__entry->index, __entry->uptodate, __entry->raw) ); TRACE_EVENT(erofs_readpages, TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage, bool raw), TP_ARGS(inode, start, nrpage, raw), TP_STRUCT__entry( __field(dev_t, dev ) __field(erofs_nid_t, nid ) __field(pgoff_t, start ) __field(unsigned int, nrpage ) __field(bool, raw ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->start = start; __entry->nrpage = nrpage; __entry->raw = raw; ), TP_printk("dev = (%d,%d), nid = %llu, start = %lu nrpage = %u raw = %d", show_dev_nid(__entry), (unsigned long)__entry->start, __entry->nrpage, __entry->raw) ); DECLARE_EVENT_CLASS(erofs__map_blocks_enter, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags), TP_ARGS(inode, map, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) __field( erofs_off_t, la ) __field( u64, llen ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->la = map->m_la; __entry->llen = map->m_llen; __entry->flags = flags; ), TP_printk("dev = (%d,%d), nid = %llu, la %llu llen %llu flags %s", show_dev_nid(__entry), __entry->la, __entry->llen, __entry->flags ? show_map_flags(__entry->flags) : "NULL") ); DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_flatmode_enter, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned flags), TP_ARGS(inode, map, flags) ); DEFINE_EVENT(erofs__map_blocks_enter, z_erofs_map_blocks_iter_enter, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags), TP_ARGS(inode, map, flags) ); DECLARE_EVENT_CLASS(erofs__map_blocks_exit, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags, int ret), TP_ARGS(inode, map, flags, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( erofs_nid_t, nid ) __field( unsigned int, flags ) __field( erofs_off_t, la ) __field( erofs_off_t, pa ) __field( u64, llen ) __field( u64, plen ) __field( unsigned int, mflags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; __entry->flags = flags; __entry->la = map->m_la; __entry->pa = map->m_pa; __entry->llen = map->m_llen; __entry->plen = map->m_plen; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev = (%d,%d), nid = %llu, flags %s " "la %llu pa %llu llen %llu plen %llu mflags %s ret %d", show_dev_nid(__entry), __entry->flags ? show_map_flags(__entry->flags) : "NULL", __entry->la, __entry->pa, __entry->llen, __entry->plen, show_mflags(__entry->mflags), __entry->ret) ); DEFINE_EVENT(erofs__map_blocks_exit, erofs_map_blocks_flatmode_exit, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned flags, int ret), TP_ARGS(inode, map, flags, ret) ); DEFINE_EVENT(erofs__map_blocks_exit, z_erofs_map_blocks_iter_exit, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags, int ret), TP_ARGS(inode, map, flags, ret) ); #endif /* _TRACE_EROFS_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
30 30 30 29 7 8 8 5 6 8 11 7 8 12 30 18 8 7 7 8 9 11 8 5 7 30 22 13 22 30 3 2 1 1 13 13 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 // SPDX-License-Identifier: GPL-2.0-or-later /* * The AEGIS-128 Authenticated-Encryption Algorithm * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <asm/simd.h> #include "aegis.h" #define AEGIS128_NONCE_SIZE 16 #define AEGIS128_STATE_BLOCKS 5 #define AEGIS128_KEY_SIZE 16 #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 struct aegis_state { union aegis_block blocks[AEGIS128_STATE_BLOCKS]; }; struct aegis_ctx { union aegis_block key; }; static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_simd); static const union aegis_block crypto_aegis_const[2] = { { .words64 = { cpu_to_le64(U64_C(0x0d08050302010100)), cpu_to_le64(U64_C(0x6279e99059372215)), } }, { .words64 = { cpu_to_le64(U64_C(0xf12fc26d55183ddb)), cpu_to_le64(U64_C(0xdd28b57342311120)), } }, }; static bool aegis128_do_simd(void) { #ifdef CONFIG_CRYPTO_AEGIS128_SIMD if (static_branch_likely(&have_simd)) return crypto_simd_usable(); #endif return false; } static void crypto_aegis128_update(struct aegis_state *state) { union aegis_block tmp; unsigned int i; tmp = state->blocks[AEGIS128_STATE_BLOCKS - 1]; for (i = AEGIS128_STATE_BLOCKS - 1; i > 0; i--) crypto_aegis_aesenc(&state->blocks[i], &state->blocks[i - 1], &state->blocks[i]); crypto_aegis_aesenc(&state->blocks[0], &tmp, &state->blocks[0]); } static void crypto_aegis128_update_a(struct aegis_state *state, const union aegis_block *msg, bool do_simd) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && do_simd) { crypto_aegis128_update_simd(state, msg); return; } crypto_aegis128_update(state); crypto_aegis_block_xor(&state->blocks[0], msg); } static void crypto_aegis128_update_u(struct aegis_state *state, const void *msg, bool do_simd) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && do_simd) { crypto_aegis128_update_simd(state, msg); return; } crypto_aegis128_update(state); crypto_xor(state->blocks[0].bytes, msg, AEGIS_BLOCK_SIZE); } static void crypto_aegis128_init(struct aegis_state *state, const union aegis_block *key, const u8 *iv) { union aegis_block key_iv; unsigned int i; key_iv = *key; crypto_xor(key_iv.bytes, iv, AEGIS_BLOCK_SIZE); state->blocks[0] = key_iv; state->blocks[1] = crypto_aegis_const[1]; state->blocks[2] = crypto_aegis_const[0]; state->blocks[3] = *key; state->blocks[4] = *key; crypto_aegis_block_xor(&state->blocks[3], &crypto_aegis_const[0]); crypto_aegis_block_xor(&state->blocks[4], &crypto_aegis_const[1]); for (i = 0; i < 5; i++) { crypto_aegis128_update_a(state, key, false); crypto_aegis128_update_a(state, &key_iv, false); } } static void crypto_aegis128_ad(struct aegis_state *state, const u8 *src, unsigned int size, bool do_simd) { if (AEGIS_ALIGNED(src)) { const union aegis_block *src_blk = (const union aegis_block *)src; while (size >= AEGIS_BLOCK_SIZE) { crypto_aegis128_update_a(state, src_blk, do_simd); size -= AEGIS_BLOCK_SIZE; src_blk++; } } else { while (size >= AEGIS_BLOCK_SIZE) { crypto_aegis128_update_u(state, src, do_simd); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; } } } static void crypto_aegis128_wipe_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { memzero_explicit(dst, size); } static void crypto_aegis128_encrypt_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { union aegis_block tmp; if (AEGIS_ALIGNED(src) && AEGIS_ALIGNED(dst)) { while (size >= AEGIS_BLOCK_SIZE) { union aegis_block *dst_blk = (union aegis_block *)dst; const union aegis_block *src_blk = (const union aegis_block *)src; tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&tmp, src_blk); crypto_aegis128_update_a(state, src_blk, false); *dst_blk = tmp; size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } else { while (size >= AEGIS_BLOCK_SIZE) { tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_xor(tmp.bytes, src, AEGIS_BLOCK_SIZE); crypto_aegis128_update_u(state, src, false); memcpy(dst, tmp.bytes, AEGIS_BLOCK_SIZE); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } if (size > 0) { union aegis_block msg = {}; memcpy(msg.bytes, src, size); tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis128_update_a(state, &msg, false); crypto_aegis_block_xor(&msg, &tmp); memcpy(dst, msg.bytes, size); } } static void crypto_aegis128_decrypt_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { union aegis_block tmp; if (AEGIS_ALIGNED(src) && AEGIS_ALIGNED(dst)) { while (size >= AEGIS_BLOCK_SIZE) { union aegis_block *dst_blk = (union aegis_block *)dst; const union aegis_block *src_blk = (const union aegis_block *)src; tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&tmp, src_blk); crypto_aegis128_update_a(state, &tmp, false); *dst_blk = tmp; size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } else { while (size >= AEGIS_BLOCK_SIZE) { tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_xor(tmp.bytes, src, AEGIS_BLOCK_SIZE); crypto_aegis128_update_a(state, &tmp, false); memcpy(dst, tmp.bytes, AEGIS_BLOCK_SIZE); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } if (size > 0) { union aegis_block msg = {}; memcpy(msg.bytes, src, size); tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&msg, &tmp); memset(msg.bytes + size, 0, AEGIS_BLOCK_SIZE - size); crypto_aegis128_update_a(state, &msg, false); memcpy(dst, msg.bytes, size); } } static void crypto_aegis128_process_ad(struct aegis_state *state, struct scatterlist *sg_src, unsigned int assoclen, bool do_simd) { struct scatter_walk walk; union aegis_block buf; unsigned int pos = 0; scatterwalk_start(&walk, sg_src); while (assoclen != 0) { unsigned int size = scatterwalk_clamp(&walk, assoclen); unsigned int left = size; void *mapped = scatterwalk_map(&walk); const u8 *src = (const u8 *)mapped; if (pos + size >= AEGIS_BLOCK_SIZE) { if (pos > 0) { unsigned int fill = AEGIS_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); crypto_aegis128_update_a(state, &buf, do_simd); pos = 0; left -= fill; src += fill; } crypto_aegis128_ad(state, src, left, do_simd); src += left & ~(AEGIS_BLOCK_SIZE - 1); left &= AEGIS_BLOCK_SIZE - 1; } memcpy(buf.bytes + pos, src, left); pos += left; assoclen -= size; scatterwalk_unmap(mapped); scatterwalk_advance(&walk, size); scatterwalk_done(&walk, 0, assoclen); } if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS_BLOCK_SIZE - pos); crypto_aegis128_update_a(state, &buf, do_simd); } } static __always_inline int crypto_aegis128_process_crypt(struct aegis_state *state, struct skcipher_walk *walk, void (*crypt)(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size)) { int err = 0; while (walk->nbytes) { unsigned int nbytes = walk->nbytes; if (nbytes < walk->total) nbytes = round_down(nbytes, walk->stride); crypt(state, walk->dst.virt.addr, walk->src.virt.addr, nbytes); err = skcipher_walk_done(walk, walk->nbytes - nbytes); } return err; } static void crypto_aegis128_final(struct aegis_state *state, union aegis_block *tag_xor, u64 assoclen, u64 cryptlen) { u64 assocbits = assoclen * 8; u64 cryptbits = cryptlen * 8; union aegis_block tmp; unsigned int i; tmp.words64[0] = cpu_to_le64(assocbits); tmp.words64[1] = cpu_to_le64(cryptbits); crypto_aegis_block_xor(&tmp, &state->blocks[3]); for (i = 0; i < 7; i++) crypto_aegis128_update_a(state, &tmp, false); for (i = 0; i < AEGIS128_STATE_BLOCKS; i++) crypto_aegis_block_xor(tag_xor, &state->blocks[i]); } static int crypto_aegis128_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct aegis_ctx *ctx = crypto_aead_ctx(aead); if (keylen != AEGIS128_KEY_SIZE) return -EINVAL; memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); return 0; } static int crypto_aegis128_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { if (authsize > AEGIS128_MAX_AUTH_SIZE) return -EINVAL; if (authsize < AEGIS128_MIN_AUTH_SIZE) return -EINVAL; return 0; } static int crypto_aegis128_encrypt_generic(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); struct aegis_ctx *ctx = crypto_aead_ctx(tfm); unsigned int cryptlen = req->cryptlen; struct skcipher_walk walk; struct aegis_state state; skcipher_walk_aead_encrypt(&walk, req, false); crypto_aegis128_init(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, false); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_encrypt_chunk); crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); return 0; } static int crypto_aegis128_decrypt_generic(struct aead_request *req) { static const u8 zeros[AEGIS128_MAX_AUTH_SIZE] = {}; struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; struct aegis_ctx *ctx = crypto_aead_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_init(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, false); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_decrypt_chunk); crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen); if (unlikely(crypto_memneq(tag.bytes, zeros, authsize))) { /* * From Chapter 4. 'Security Analysis' of the AEGIS spec [0] * * "3. If verification fails, the decrypted plaintext and the * wrong authentication tag should not be given as output." * * [0] https://competitions.cr.yp.to/round3/aegisv11.pdf */ skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_process_crypt(NULL, &walk, crypto_aegis128_wipe_chunk); memzero_explicit(&tag, sizeof(tag)); return -EBADMSG; } return 0; } static int crypto_aegis128_encrypt_simd(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); struct aegis_ctx *ctx = crypto_aead_ctx(tfm); unsigned int cryptlen = req->cryptlen; struct skcipher_walk walk; struct aegis_state state; if (!aegis128_do_simd()) return crypto_aegis128_encrypt_generic(req); skcipher_walk_aead_encrypt(&walk, req, false); crypto_aegis128_init_simd(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, true); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_encrypt_chunk_simd); crypto_aegis128_final_simd(&state, &tag, req->assoclen, cryptlen, 0); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); return 0; } static int crypto_aegis128_decrypt_simd(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; struct aegis_ctx *ctx = crypto_aead_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; if (!aegis128_do_simd()) return crypto_aegis128_decrypt_generic(req); scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_init_simd(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, true); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_decrypt_chunk_simd); if (unlikely(crypto_aegis128_final_simd(&state, &tag, req->assoclen, cryptlen, authsize))) { skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_process_crypt(NULL, &walk, crypto_aegis128_wipe_chunk); return -EBADMSG; } return 0; } static struct aead_alg crypto_aegis128_alg_generic = { .setkey = crypto_aegis128_setkey, .setauthsize = crypto_aegis128_setauthsize, .encrypt = crypto_aegis128_encrypt_generic, .decrypt = crypto_aegis128_decrypt_generic, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS_BLOCK_SIZE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aegis_ctx), .base.cra_alignmask = 0, .base.cra_priority = 100, .base.cra_name = "aegis128", .base.cra_driver_name = "aegis128-generic", .base.cra_module = THIS_MODULE, }; static struct aead_alg crypto_aegis128_alg_simd = { .setkey = crypto_aegis128_setkey, .setauthsize = crypto_aegis128_setauthsize, .encrypt = crypto_aegis128_encrypt_simd, .decrypt = crypto_aegis128_decrypt_simd, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS_BLOCK_SIZE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aegis_ctx), .base.cra_alignmask = 0, .base.cra_priority = 200, .base.cra_name = "aegis128", .base.cra_driver_name = "aegis128-simd", .base.cra_module = THIS_MODULE, }; static int __init crypto_aegis128_module_init(void) { int ret; ret = crypto_register_aead(&crypto_aegis128_alg_generic); if (ret) return ret; if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && crypto_aegis128_have_simd()) { ret = crypto_register_aead(&crypto_aegis128_alg_simd); if (ret) { crypto_unregister_aead(&crypto_aegis128_alg_generic); return ret; } static_branch_enable(&have_simd); } return 0; } static void __exit crypto_aegis128_module_exit(void) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && crypto_aegis128_have_simd()) crypto_unregister_aead(&crypto_aegis128_alg_simd); crypto_unregister_aead(&crypto_aegis128_alg_generic); } subsys_initcall(crypto_aegis128_module_init); module_exit(crypto_aegis128_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-generic"); MODULE_ALIAS_CRYPTO("aegis128-simd");
189 149 217 192 190 170 209 170 165 170 8 7 146 164 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 // SPDX-License-Identifier: GPL-2.0-or-later /* * sysfile.c * * Initialize, read, write, etc. system files. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "dir.h" #include "inode.h" #include "journal.h" #include "sysfile.h" #include "buffer_head_io.h" static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, int type, u32 slot); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES]; #endif static inline int is_global_system_inode(int type) { return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; } static struct inode **get_local_system_inode(struct ocfs2_super *osb, int type, u32 slot) { int index; struct inode **local_system_inodes, **free = NULL; BUG_ON(slot == OCFS2_INVALID_SLOT); BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE || type > OCFS2_LAST_LOCAL_SYSTEM_INODE); spin_lock(&osb->osb_lock); local_system_inodes = osb->local_system_inodes; spin_unlock(&osb->osb_lock); if (unlikely(!local_system_inodes)) { local_system_inodes = kzalloc(array3_size(sizeof(struct inode *), NUM_LOCAL_SYSTEM_INODES, osb->max_slots), GFP_NOFS); if (!local_system_inodes) { mlog_errno(-ENOMEM); /* * return NULL here so that ocfs2_get_sytem_file_inodes * will try to create an inode and use it. We will try * to initialize local_system_inodes next time. */ return NULL; } spin_lock(&osb->osb_lock); if (osb->local_system_inodes) { /* Someone has initialized it for us. */ free = local_system_inodes; local_system_inodes = osb->local_system_inodes; } else osb->local_system_inodes = local_system_inodes; spin_unlock(&osb->osb_lock); kfree(free); } index = (slot * NUM_LOCAL_SYSTEM_INODES) + (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE); return &local_system_inodes[index]; } struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, int type, u32 slot) { struct inode *inode = NULL; struct inode **arr = NULL; /* avoid the lookup if cached in local system file array */ if (is_global_system_inode(type)) { arr = &(osb->global_system_inodes[type]); } else arr = get_local_system_inode(osb, type, slot); mutex_lock(&osb->system_file_mutex); if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); mutex_unlock(&osb->system_file_mutex); BUG_ON(!inode); return inode; } /* this gets one ref thru iget */ inode = _ocfs2_get_system_file_inode(osb, type, slot); /* add one more if putting into array for first time */ if (arr && inode) { *arr = igrab(inode); BUG_ON(!*arr); } mutex_unlock(&osb->system_file_mutex); return inode; } static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, int type, u32 slot) { char namebuf[40]; struct inode *inode = NULL; u64 blkno; int status = 0; ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, strlen(namebuf), &blkno); if (status < 0) { goto bail; } inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE, type); if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); inode = NULL; goto bail; } #ifdef CONFIG_DEBUG_LOCK_ALLOC if (type == LOCAL_USER_QUOTA_SYSTEM_INODE || type == LOCAL_GROUP_QUOTA_SYSTEM_INODE || type == JOURNAL_SYSTEM_INODE) { /* Ignore inode lock on these inodes as the lock does not * really belong to any process and lockdep cannot handle * that */ OCFS2_I(inode)->ip_inode_lockres.l_lockdep_map.key = NULL; } else { lockdep_init_map(&OCFS2_I(inode)->ip_inode_lockres. l_lockdep_map, ocfs2_system_inodes[type].si_name, &ocfs2_sysfile_cluster_lock_key[type], 0); } #endif bail: return inode; }
4 2 2 2 2 2 4 4 1 3 1 1 3 3 4 3 1 49 21 2 24 1 1 20 5 3 2 9 2 9 3 4 2 4 1 5 7 4 3 1 3 1 4 1 2 1 2 2 2 2 2 1 1 2 2 2 2 1 1 1 2 1 1 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 // SPDX-License-Identifier: GPL-2.0+ /* * cdc-acm.c * * Copyright (c) 1999 Armin Fuerst <fuerst@in.tum.de> * Copyright (c) 1999 Pavel Machek <pavel@ucw.cz> * Copyright (c) 1999 Johannes Erdfelt <johannes@erdfelt.com> * Copyright (c) 2000 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2004 Oliver Neukum <oliver@neukum.name> * Copyright (c) 2005 David Kubicek <dave@awk.cz> * Copyright (c) 2011 Johan Hovold <jhovold@gmail.com> * * USB Abstract Control Model driver for USB modems and ISDN adapters * * Sponsored by SuSE */ #undef DEBUG #undef VERBOSE_DEBUG #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/log2.h> #include <linux/tty.h> #include <linux/serial.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <asm/byteorder.h> #include <asm/unaligned.h> #include <linux/idr.h> #include <linux/list.h> #include "cdc-acm.h" #define DRIVER_AUTHOR "Armin Fuerst, Pavel Machek, Johannes Erdfelt, Vojtech Pavlik, David Kubicek, Johan Hovold" #define DRIVER_DESC "USB Abstract Control Model driver for USB modems and ISDN adapters" static struct usb_driver acm_driver; static struct tty_driver *acm_tty_driver; static DEFINE_IDR(acm_minors); static DEFINE_MUTEX(acm_minors_lock); static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios_old); /* * acm_minors accessors */ /* * Look up an ACM structure by minor. If found and not disconnected, increment * its refcount and return it with its mutex held. */ static struct acm *acm_get_by_minor(unsigned int minor) { struct acm *acm; mutex_lock(&acm_minors_lock); acm = idr_find(&acm_minors, minor); if (acm) { mutex_lock(&acm->mutex); if (acm->disconnected) { mutex_unlock(&acm->mutex); acm = NULL; } else { tty_port_get(&acm->port); mutex_unlock(&acm->mutex); } } mutex_unlock(&acm_minors_lock); return acm; } /* * Try to find an available minor number and if found, associate it with 'acm'. */ static int acm_alloc_minor(struct acm *acm) { int minor; mutex_lock(&acm_minors_lock); minor = idr_alloc(&acm_minors, acm, 0, ACM_TTY_MINORS, GFP_KERNEL); mutex_unlock(&acm_minors_lock); return minor; } /* Release the minor number associated with 'acm'. */ static void acm_release_minor(struct acm *acm) { mutex_lock(&acm_minors_lock); idr_remove(&acm_minors, acm->minor); mutex_unlock(&acm_minors_lock); } /* * Functions for ACM control messages. */ static int acm_ctrl_msg(struct acm *acm, int request, int value, void *buf, int len) { int retval; retval = usb_autopm_get_interface(acm->control); if (retval) return retval; retval = usb_control_msg(acm->dev, usb_sndctrlpipe(acm->dev, 0), request, USB_RT_ACM, value, acm->control->altsetting[0].desc.bInterfaceNumber, buf, len, 5000); dev_dbg(&acm->control->dev, "%s - rq 0x%02x, val %#x, len %#x, result %d\n", __func__, request, value, len, retval); usb_autopm_put_interface(acm->control); return retval < 0 ? retval : 0; } /* devices aren't required to support these requests. * the cdc acm descriptor tells whether they do... */ static inline int acm_set_control(struct acm *acm, int control) { if (acm->quirks & QUIRK_CONTROL_LINE_STATE) return -EOPNOTSUPP; return acm_ctrl_msg(acm, USB_CDC_REQ_SET_CONTROL_LINE_STATE, control, NULL, 0); } #define acm_set_line(acm, line) \ acm_ctrl_msg(acm, USB_CDC_REQ_SET_LINE_CODING, 0, line, sizeof *(line)) #define acm_send_break(acm, ms) \ acm_ctrl_msg(acm, USB_CDC_REQ_SEND_BREAK, ms, NULL, 0) static void acm_poison_urbs(struct acm *acm) { int i; usb_poison_urb(acm->ctrlurb); for (i = 0; i < ACM_NW; i++) usb_poison_urb(acm->wb[i].urb); for (i = 0; i < acm->rx_buflimit; i++) usb_poison_urb(acm->read_urbs[i]); } static void acm_unpoison_urbs(struct acm *acm) { int i; for (i = 0; i < acm->rx_buflimit; i++) usb_unpoison_urb(acm->read_urbs[i]); for (i = 0; i < ACM_NW; i++) usb_unpoison_urb(acm->wb[i].urb); usb_unpoison_urb(acm->ctrlurb); } /* * Write buffer management. * All of these assume proper locks taken by the caller. */ static int acm_wb_alloc(struct acm *acm) { int i, wbn; struct acm_wb *wb; wbn = 0; i = 0; for (;;) { wb = &acm->wb[wbn]; if (!wb->use) { wb->use = true; wb->len = 0; return wbn; } wbn = (wbn + 1) % ACM_NW; if (++i >= ACM_NW) return -1; } } static int acm_wb_is_avail(struct acm *acm) { int i, n; unsigned long flags; n = ACM_NW; spin_lock_irqsave(&acm->write_lock, flags); for (i = 0; i < ACM_NW; i++) if(acm->wb[i].use) n--; spin_unlock_irqrestore(&acm->write_lock, flags); return n; } /* * Finish write. Caller must hold acm->write_lock */ static void acm_write_done(struct acm *acm, struct acm_wb *wb) { wb->use = false; acm->transmitting--; usb_autopm_put_interface_async(acm->control); } /* * Poke write. * * the caller is responsible for locking */ static int acm_start_wb(struct acm *acm, struct acm_wb *wb) { int rc; acm->transmitting++; wb->urb->transfer_buffer = wb->buf; wb->urb->transfer_dma = wb->dmah; wb->urb->transfer_buffer_length = wb->len; wb->urb->dev = acm->dev; rc = usb_submit_urb(wb->urb, GFP_ATOMIC); if (rc < 0) { if (rc != -EPERM) dev_err(&acm->data->dev, "%s - usb_submit_urb(write bulk) failed: %d\n", __func__, rc); acm_write_done(acm, wb); } return rc; } /* * attributes exported through sysfs */ static ssize_t bmCapabilities_show (struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct acm *acm = usb_get_intfdata(intf); return sprintf(buf, "%d", acm->ctrl_caps); } static DEVICE_ATTR_RO(bmCapabilities); static ssize_t wCountryCodes_show (struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct acm *acm = usb_get_intfdata(intf); memcpy(buf, acm->country_codes, acm->country_code_size); return acm->country_code_size; } static DEVICE_ATTR_RO(wCountryCodes); static ssize_t iCountryCodeRelDate_show (struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct acm *acm = usb_get_intfdata(intf); return sprintf(buf, "%d", acm->country_rel_date); } static DEVICE_ATTR_RO(iCountryCodeRelDate); /* * Interrupt handlers for various ACM device responses */ static void acm_process_notification(struct acm *acm, unsigned char *buf) { int newctrl; int difference; unsigned long flags; struct usb_cdc_notification *dr = (struct usb_cdc_notification *)buf; unsigned char *data = buf + sizeof(struct usb_cdc_notification); switch (dr->bNotificationType) { case USB_CDC_NOTIFY_NETWORK_CONNECTION: dev_dbg(&acm->control->dev, "%s - network connection: %d\n", __func__, dr->wValue); break; case USB_CDC_NOTIFY_SERIAL_STATE: if (le16_to_cpu(dr->wLength) != 2) { dev_dbg(&acm->control->dev, "%s - malformed serial state\n", __func__); break; } newctrl = get_unaligned_le16(data); dev_dbg(&acm->control->dev, "%s - serial state: 0x%x\n", __func__, newctrl); if (!acm->clocal && (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) { dev_dbg(&acm->control->dev, "%s - calling hangup\n", __func__); tty_port_tty_hangup(&acm->port, false); } difference = acm->ctrlin ^ newctrl; spin_lock_irqsave(&acm->read_lock, flags); acm->ctrlin = newctrl; acm->oldcount = acm->iocount; if (difference & ACM_CTRL_DSR) acm->iocount.dsr++; if (difference & ACM_CTRL_DCD) acm->iocount.dcd++; if (newctrl & ACM_CTRL_BRK) { acm->iocount.brk++; tty_insert_flip_char(&acm->port, 0, TTY_BREAK); } if (newctrl & ACM_CTRL_RI) acm->iocount.rng++; if (newctrl & ACM_CTRL_FRAMING) acm->iocount.frame++; if (newctrl & ACM_CTRL_PARITY) acm->iocount.parity++; if (newctrl & ACM_CTRL_OVERRUN) acm->iocount.overrun++; spin_unlock_irqrestore(&acm->read_lock, flags); if (newctrl & ACM_CTRL_BRK) tty_flip_buffer_push(&acm->port); if (difference) wake_up_all(&acm->wioctl); break; default: dev_dbg(&acm->control->dev, "%s - unknown notification %d received: index %d len %d\n", __func__, dr->bNotificationType, dr->wIndex, dr->wLength); } } /* control interface reports status changes with "interrupt" transfers */ static void acm_ctrl_irq(struct urb *urb) { struct acm *acm = urb->context; struct usb_cdc_notification *dr; unsigned int current_size = urb->actual_length; unsigned int expected_size, copy_size, alloc_size; int retval; int status = urb->status; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&acm->control->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(&acm->control->dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } usb_mark_last_busy(acm->dev); if (acm->nb_index == 0) { /* * The first chunk of a message must contain at least the * notification header with the length field, otherwise we * can't get an expected_size. */ if (current_size < sizeof(struct usb_cdc_notification)) { dev_dbg(&acm->control->dev, "urb too short\n"); goto exit; } dr = urb->transfer_buffer; } else { dr = (struct usb_cdc_notification *)acm->notification_buffer; } /* size = notification-header + (optional) data */ expected_size = sizeof(struct usb_cdc_notification) + le16_to_cpu(dr->wLength); if (acm->nb_index != 0 || current_size < expected_size) { /* notification is transmitted fragmented, reassemble */ if (acm->nb_size < expected_size) { u8 *new_buffer; alloc_size = roundup_pow_of_two(expected_size); /* Final freeing is done on disconnect. */ new_buffer = krealloc(acm->notification_buffer, alloc_size, GFP_ATOMIC); if (!new_buffer) { acm->nb_index = 0; goto exit; } acm->notification_buffer = new_buffer; acm->nb_size = alloc_size; dr = (struct usb_cdc_notification *)acm->notification_buffer; } copy_size = min(current_size, expected_size - acm->nb_index); memcpy(&acm->notification_buffer[acm->nb_index], urb->transfer_buffer, copy_size); acm->nb_index += copy_size; current_size = acm->nb_index; } if (current_size >= expected_size) { /* notification complete */ acm_process_notification(acm, (unsigned char *)dr); acm->nb_index = 0; } exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval && retval != -EPERM && retval != -ENODEV) dev_err(&acm->control->dev, "%s - usb_submit_urb failed: %d\n", __func__, retval); else dev_vdbg(&acm->control->dev, "control resubmission terminated %d\n", retval); } static int acm_submit_read_urb(struct acm *acm, int index, gfp_t mem_flags) { int res; if (!test_and_clear_bit(index, &acm->read_urbs_free)) return 0; res = usb_submit_urb(acm->read_urbs[index], mem_flags); if (res) { if (res != -EPERM && res != -ENODEV) { dev_err(&acm->data->dev, "urb %d failed submission with %d\n", index, res); } else { dev_vdbg(&acm->data->dev, "intended failure %d\n", res); } set_bit(index, &acm->read_urbs_free); return res; } else { dev_vdbg(&acm->data->dev, "submitted urb %d\n", index); } return 0; } static int acm_submit_read_urbs(struct acm *acm, gfp_t mem_flags) { int res; int i; for (i = 0; i < acm->rx_buflimit; ++i) { res = acm_submit_read_urb(acm, i, mem_flags); if (res) return res; } return 0; } static void acm_process_read_urb(struct acm *acm, struct urb *urb) { unsigned long flags; if (!urb->actual_length) return; spin_lock_irqsave(&acm->read_lock, flags); tty_insert_flip_string(&acm->port, urb->transfer_buffer, urb->actual_length); spin_unlock_irqrestore(&acm->read_lock, flags); tty_flip_buffer_push(&acm->port); } static void acm_read_bulk_callback(struct urb *urb) { struct acm_rb *rb = urb->context; struct acm *acm = rb->instance; int status = urb->status; bool stopped = false; bool stalled = false; bool cooldown = false; dev_vdbg(&acm->data->dev, "got urb %d, len %d, status %d\n", rb->index, urb->actual_length, status); switch (status) { case 0: usb_mark_last_busy(acm->dev); acm_process_read_urb(acm, urb); break; case -EPIPE: set_bit(EVENT_RX_STALL, &acm->flags); stalled = true; break; case -ENOENT: case -ECONNRESET: case -ESHUTDOWN: dev_dbg(&acm->data->dev, "%s - urb shutting down with status: %d\n", __func__, status); stopped = true; break; case -EOVERFLOW: case -EPROTO: dev_dbg(&acm->data->dev, "%s - cooling babbling device\n", __func__); usb_mark_last_busy(acm->dev); set_bit(rb->index, &acm->urbs_in_error_delay); set_bit(ACM_ERROR_DELAY, &acm->flags); cooldown = true; break; default: dev_dbg(&acm->data->dev, "%s - nonzero urb status received: %d\n", __func__, status); break; } /* * Make sure URB processing is done before marking as free to avoid * racing with unthrottle() on another CPU. Matches the barriers * implied by the test_and_clear_bit() in acm_submit_read_urb(). */ smp_mb__before_atomic(); set_bit(rb->index, &acm->read_urbs_free); /* * Make sure URB is marked as free before checking the throttled flag * to avoid racing with unthrottle() on another CPU. Matches the * smp_mb() in unthrottle(). */ smp_mb__after_atomic(); if (stopped || stalled || cooldown) { if (stalled) schedule_delayed_work(&acm->dwork, 0); else if (cooldown) schedule_delayed_work(&acm->dwork, HZ / 2); return; } if (test_bit(ACM_THROTTLED, &acm->flags)) return; acm_submit_read_urb(acm, rb->index, GFP_ATOMIC); } /* data interface wrote those outgoing bytes */ static void acm_write_bulk(struct urb *urb) { struct acm_wb *wb = urb->context; struct acm *acm = wb->instance; unsigned long flags; int status = urb->status; if (status || (urb->actual_length != urb->transfer_buffer_length)) dev_vdbg(&acm->data->dev, "wrote len %d/%d, status %d\n", urb->actual_length, urb->transfer_buffer_length, status); spin_lock_irqsave(&acm->write_lock, flags); acm_write_done(acm, wb); spin_unlock_irqrestore(&acm->write_lock, flags); set_bit(EVENT_TTY_WAKEUP, &acm->flags); schedule_delayed_work(&acm->dwork, 0); } static void acm_softint(struct work_struct *work) { int i; struct acm *acm = container_of(work, struct acm, dwork.work); if (test_bit(EVENT_RX_STALL, &acm->flags)) { smp_mb(); /* against acm_suspend() */ if (!acm->susp_count) { for (i = 0; i < acm->rx_buflimit; i++) usb_kill_urb(acm->read_urbs[i]); usb_clear_halt(acm->dev, acm->in); acm_submit_read_urbs(acm, GFP_KERNEL); clear_bit(EVENT_RX_STALL, &acm->flags); } } if (test_and_clear_bit(ACM_ERROR_DELAY, &acm->flags)) { for (i = 0; i < acm->rx_buflimit; i++) if (test_and_clear_bit(i, &acm->urbs_in_error_delay)) acm_submit_read_urb(acm, i, GFP_KERNEL); } if (test_and_clear_bit(EVENT_TTY_WAKEUP, &acm->flags)) tty_port_tty_wakeup(&acm->port); } /* * TTY handlers */ static int acm_tty_install(struct tty_driver *driver, struct tty_struct *tty) { struct acm *acm; int retval; acm = acm_get_by_minor(tty->index); if (!acm) return -ENODEV; retval = tty_standard_install(driver, tty); if (retval) goto error_init_termios; /* * Suppress initial echoing for some devices which might send data * immediately after acm driver has been installed. */ if (acm->quirks & DISABLE_ECHO) tty->termios.c_lflag &= ~ECHO; tty->driver_data = acm; return 0; error_init_termios: tty_port_put(&acm->port); return retval; } static int acm_tty_open(struct tty_struct *tty, struct file *filp) { struct acm *acm = tty->driver_data; return tty_port_open(&acm->port, tty, filp); } static void acm_port_dtr_rts(struct tty_port *port, int raise) { struct acm *acm = container_of(port, struct acm, port); int val; int res; if (raise) val = ACM_CTRL_DTR | ACM_CTRL_RTS; else val = 0; /* FIXME: add missing ctrlout locking throughout driver */ acm->ctrlout = val; res = acm_set_control(acm, val); if (res && (acm->ctrl_caps & USB_CDC_CAP_LINE)) /* This is broken in too many devices to spam the logs */ dev_dbg(&acm->control->dev, "failed to set dtr/rts\n"); } static int acm_port_activate(struct tty_port *port, struct tty_struct *tty) { struct acm *acm = container_of(port, struct acm, port); int retval = -ENODEV; int i; mutex_lock(&acm->mutex); if (acm->disconnected) goto disconnected; retval = usb_autopm_get_interface(acm->control); if (retval) goto error_get_interface; /* * FIXME: Why do we need this? Allocating 64K of physically contiguous * memory is really nasty... */ set_bit(TTY_NO_WRITE_SPLIT, &tty->flags); acm->control->needs_remote_wakeup = 1; acm->ctrlurb->dev = acm->dev; retval = usb_submit_urb(acm->ctrlurb, GFP_KERNEL); if (retval) { dev_err(&acm->control->dev, "%s - usb_submit_urb(ctrl irq) failed\n", __func__); goto error_submit_urb; } acm_tty_set_termios(tty, NULL); /* * Unthrottle device in case the TTY was closed while throttled. */ clear_bit(ACM_THROTTLED, &acm->flags); retval = acm_submit_read_urbs(acm, GFP_KERNEL); if (retval) goto error_submit_read_urbs; usb_autopm_put_interface(acm->control); mutex_unlock(&acm->mutex); return 0; error_submit_read_urbs: for (i = 0; i < acm->rx_buflimit; i++) usb_kill_urb(acm->read_urbs[i]); usb_kill_urb(acm->ctrlurb); error_submit_urb: usb_autopm_put_interface(acm->control); error_get_interface: disconnected: mutex_unlock(&acm->mutex); return usb_translate_errors(retval); } static void acm_port_destruct(struct tty_port *port) { struct acm *acm = container_of(port, struct acm, port); if (acm->minor != ACM_MINOR_INVALID) acm_release_minor(acm); usb_put_intf(acm->control); kfree(acm->country_codes); kfree(acm); } static void acm_port_shutdown(struct tty_port *port) { struct acm *acm = container_of(port, struct acm, port); struct urb *urb; struct acm_wb *wb; /* * Need to grab write_lock to prevent race with resume, but no need to * hold it due to the tty-port initialised flag. */ acm_poison_urbs(acm); spin_lock_irq(&acm->write_lock); spin_unlock_irq(&acm->write_lock); usb_autopm_get_interface_no_resume(acm->control); acm->control->needs_remote_wakeup = 0; usb_autopm_put_interface(acm->control); for (;;) { urb = usb_get_from_anchor(&acm->delayed); if (!urb) break; wb = urb->context; wb->use = false; usb_autopm_put_interface_async(acm->control); } acm_unpoison_urbs(acm); } static void acm_tty_cleanup(struct tty_struct *tty) { struct acm *acm = tty->driver_data; tty_port_put(&acm->port); } static void acm_tty_hangup(struct tty_struct *tty) { struct acm *acm = tty->driver_data; tty_port_hangup(&acm->port); } static void acm_tty_close(struct tty_struct *tty, struct file *filp) { struct acm *acm = tty->driver_data; tty_port_close(&acm->port, tty, filp); } static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count) { struct acm *acm = tty->driver_data; int stat; unsigned long flags; int wbn; struct acm_wb *wb; if (!count) return 0; dev_vdbg(&acm->data->dev, "%d bytes from tty layer\n", count); spin_lock_irqsave(&acm->write_lock, flags); wbn = acm_wb_alloc(acm); if (wbn < 0) { spin_unlock_irqrestore(&acm->write_lock, flags); return 0; } wb = &acm->wb[wbn]; if (!acm->dev) { wb->use = false; spin_unlock_irqrestore(&acm->write_lock, flags); return -ENODEV; } count = (count > acm->writesize) ? acm->writesize : count; dev_vdbg(&acm->data->dev, "writing %d bytes\n", count); memcpy(wb->buf, buf, count); wb->len = count; stat = usb_autopm_get_interface_async(acm->control); if (stat) { wb->use = false; spin_unlock_irqrestore(&acm->write_lock, flags); return stat; } if (acm->susp_count) { usb_anchor_urb(wb->urb, &acm->delayed); spin_unlock_irqrestore(&acm->write_lock, flags); return count; } stat = acm_start_wb(acm, wb); spin_unlock_irqrestore(&acm->write_lock, flags); if (stat < 0) return stat; return count; } static unsigned int acm_tty_write_room(struct tty_struct *tty) { struct acm *acm = tty->driver_data; /* * Do not let the line discipline to know that we have a reserve, * or it might get too enthusiastic. */ return acm_wb_is_avail(acm) ? acm->writesize : 0; } static unsigned int acm_tty_chars_in_buffer(struct tty_struct *tty) { struct acm *acm = tty->driver_data; /* * if the device was unplugged then any remaining characters fell out * of the connector ;) */ if (acm->disconnected) return 0; /* * This is inaccurate (overcounts), but it works. */ return (ACM_NW - acm_wb_is_avail(acm)) * acm->writesize; } static void acm_tty_throttle(struct tty_struct *tty) { struct acm *acm = tty->driver_data; set_bit(ACM_THROTTLED, &acm->flags); } static void acm_tty_unthrottle(struct tty_struct *tty) { struct acm *acm = tty->driver_data; clear_bit(ACM_THROTTLED, &acm->flags); /* Matches the smp_mb__after_atomic() in acm_read_bulk_callback(). */ smp_mb(); acm_submit_read_urbs(acm, GFP_KERNEL); } static int acm_tty_break_ctl(struct tty_struct *tty, int state) { struct acm *acm = tty->driver_data; int retval; if (!(acm->ctrl_caps & USB_CDC_CAP_BRK)) return -EOPNOTSUPP; retval = acm_send_break(acm, state ? 0xffff : 0); if (retval < 0) dev_dbg(&acm->control->dev, "%s - send break failed\n", __func__); return retval; } static int acm_tty_tiocmget(struct tty_struct *tty) { struct acm *acm = tty->driver_data; return (acm->ctrlout & ACM_CTRL_DTR ? TIOCM_DTR : 0) | (acm->ctrlout & ACM_CTRL_RTS ? TIOCM_RTS : 0) | (acm->ctrlin & ACM_CTRL_DSR ? TIOCM_DSR : 0) | (acm->ctrlin & ACM_CTRL_RI ? TIOCM_RI : 0) | (acm->ctrlin & ACM_CTRL_DCD ? TIOCM_CD : 0) | TIOCM_CTS; } static int acm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct acm *acm = tty->driver_data; unsigned int newctrl; newctrl = acm->ctrlout; set = (set & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (set & TIOCM_RTS ? ACM_CTRL_RTS : 0); clear = (clear & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (clear & TIOCM_RTS ? ACM_CTRL_RTS : 0); newctrl = (newctrl & ~clear) | set; if (acm->ctrlout == newctrl) return 0; return acm_set_control(acm, acm->ctrlout = newctrl); } static int get_serial_info(struct tty_struct *tty, struct serial_struct *ss) { struct acm *acm = tty->driver_data; ss->line = acm->minor; mutex_lock(&acm->port.mutex); ss->close_delay = jiffies_to_msecs(acm->port.close_delay) / 10; ss->closing_wait = acm->port.closing_wait == ASYNC_CLOSING_WAIT_NONE ? ASYNC_CLOSING_WAIT_NONE : jiffies_to_msecs(acm->port.closing_wait) / 10; mutex_unlock(&acm->port.mutex); return 0; } static int set_serial_info(struct tty_struct *tty, struct serial_struct *ss) { struct acm *acm = tty->driver_data; unsigned int closing_wait, close_delay; int retval = 0; close_delay = msecs_to_jiffies(ss->close_delay * 10); closing_wait = ss->closing_wait == ASYNC_CLOSING_WAIT_NONE ? ASYNC_CLOSING_WAIT_NONE : msecs_to_jiffies(ss->closing_wait * 10); mutex_lock(&acm->port.mutex); if (!capable(CAP_SYS_ADMIN)) { if ((close_delay != acm->port.close_delay) || (closing_wait != acm->port.closing_wait)) retval = -EPERM; } else { acm->port.close_delay = close_delay; acm->port.closing_wait = closing_wait; } mutex_unlock(&acm->port.mutex); return retval; } static int wait_serial_change(struct acm *acm, unsigned long arg) { int rv = 0; DECLARE_WAITQUEUE(wait, current); struct async_icount old, new; do { spin_lock_irq(&acm->read_lock); old = acm->oldcount; new = acm->iocount; acm->oldcount = new; spin_unlock_irq(&acm->read_lock); if ((arg & TIOCM_DSR) && old.dsr != new.dsr) break; if ((arg & TIOCM_CD) && old.dcd != new.dcd) break; if ((arg & TIOCM_RI) && old.rng != new.rng) break; add_wait_queue(&acm->wioctl, &wait); set_current_state(TASK_INTERRUPTIBLE); schedule(); remove_wait_queue(&acm->wioctl, &wait); if (acm->disconnected) { if (arg & TIOCM_CD) break; else rv = -ENODEV; } else { if (signal_pending(current)) rv = -ERESTARTSYS; } } while (!rv); return rv; } static int acm_tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct acm *acm = tty->driver_data; icount->dsr = acm->iocount.dsr; icount->rng = acm->iocount.rng; icount->dcd = acm->iocount.dcd; icount->frame = acm->iocount.frame; icount->overrun = acm->iocount.overrun; icount->parity = acm->iocount.parity; icount->brk = acm->iocount.brk; return 0; } static int acm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct acm *acm = tty->driver_data; int rv = -ENOIOCTLCMD; switch (cmd) { case TIOCMIWAIT: rv = usb_autopm_get_interface(acm->control); if (rv < 0) { rv = -EIO; break; } rv = wait_serial_change(acm, arg); usb_autopm_put_interface(acm->control); break; } return rv; } static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios_old) { struct acm *acm = tty->driver_data; struct ktermios *termios = &tty->termios; struct usb_cdc_line_coding newline; int newctrl = acm->ctrlout; newline.dwDTERate = cpu_to_le32(tty_get_baud_rate(tty)); newline.bCharFormat = termios->c_cflag & CSTOPB ? 2 : 0; newline.bParityType = termios->c_cflag & PARENB ? (termios->c_cflag & PARODD ? 1 : 2) + (termios->c_cflag & CMSPAR ? 2 : 0) : 0; newline.bDataBits = tty_get_char_size(termios->c_cflag); /* FIXME: Needs to clear unsupported bits in the termios */ acm->clocal = ((termios->c_cflag & CLOCAL) != 0); if (C_BAUD(tty) == B0) { newline.dwDTERate = acm->line.dwDTERate; newctrl &= ~ACM_CTRL_DTR; } else if (termios_old && (termios_old->c_cflag & CBAUD) == B0) { newctrl |= ACM_CTRL_DTR; } if (newctrl != acm->ctrlout) acm_set_control(acm, acm->ctrlout = newctrl); if (memcmp(&acm->line, &newline, sizeof newline)) { memcpy(&acm->line, &newline, sizeof newline); dev_dbg(&acm->control->dev, "%s - set line: %d %d %d %d\n", __func__, le32_to_cpu(newline.dwDTERate), newline.bCharFormat, newline.bParityType, newline.bDataBits); acm_set_line(acm, &acm->line); } } static const struct tty_port_operations acm_port_ops = { .dtr_rts = acm_port_dtr_rts, .shutdown = acm_port_shutdown, .activate = acm_port_activate, .destruct = acm_port_destruct, }; /* * USB probe and disconnect routines. */ /* Little helpers: write/read buffers free */ static void acm_write_buffers_free(struct acm *acm) { int i; struct acm_wb *wb; for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++) usb_free_coherent(acm->dev, acm->writesize, wb->buf, wb->dmah); } static void acm_read_buffers_free(struct acm *acm) { int i; for (i = 0; i < acm->rx_buflimit; i++) usb_free_coherent(acm->dev, acm->readsize, acm->read_buffers[i].base, acm->read_buffers[i].dma); } /* Little helper: write buffers allocate */ static int acm_write_buffers_alloc(struct acm *acm) { int i; struct acm_wb *wb; for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++) { wb->buf = usb_alloc_coherent(acm->dev, acm->writesize, GFP_KERNEL, &wb->dmah); if (!wb->buf) { while (i != 0) { --i; --wb; usb_free_coherent(acm->dev, acm->writesize, wb->buf, wb->dmah); } return -ENOMEM; } } return 0; } static int acm_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_cdc_union_desc *union_header = NULL; struct usb_cdc_call_mgmt_descriptor *cmgmd = NULL; unsigned char *buffer = intf->altsetting->extra; int buflen = intf->altsetting->extralen; struct usb_interface *control_interface; struct usb_interface *data_interface; struct usb_endpoint_descriptor *epctrl = NULL; struct usb_endpoint_descriptor *epread = NULL; struct usb_endpoint_descriptor *epwrite = NULL; struct usb_device *usb_dev = interface_to_usbdev(intf); struct usb_cdc_parsed_header h; struct acm *acm; int minor; int ctrlsize, readsize; u8 *buf; int call_intf_num = -1; int data_intf_num = -1; unsigned long quirks; int num_rx_buf; int i; int combined_interfaces = 0; struct device *tty_dev; int rv = -ENOMEM; int res; /* normal quirks */ quirks = (unsigned long)id->driver_info; if (quirks == IGNORE_DEVICE) return -ENODEV; memset(&h, 0x00, sizeof(struct usb_cdc_parsed_header)); num_rx_buf = (quirks == SINGLE_RX_URB) ? 1 : ACM_NR; /* handle quirks deadly to normal probing*/ if (quirks == NO_UNION_NORMAL) { data_interface = usb_ifnum_to_if(usb_dev, 1); control_interface = usb_ifnum_to_if(usb_dev, 0); /* we would crash */ if (!data_interface || !control_interface) return -ENODEV; goto skip_normal_probe; } /* normal probing*/ if (!buffer) { dev_err(&intf->dev, "Weird descriptor references\n"); return -EINVAL; } if (!buflen) { if (intf->cur_altsetting->endpoint && intf->cur_altsetting->endpoint->extralen && intf->cur_altsetting->endpoint->extra) { dev_dbg(&intf->dev, "Seeking extra descriptors on endpoint\n"); buflen = intf->cur_altsetting->endpoint->extralen; buffer = intf->cur_altsetting->endpoint->extra; } else { dev_err(&intf->dev, "Zero length descriptor references\n"); return -EINVAL; } } cdc_parse_cdc_header(&h, intf, buffer, buflen); union_header = h.usb_cdc_union_desc; cmgmd = h.usb_cdc_call_mgmt_descriptor; if (cmgmd) call_intf_num = cmgmd->bDataInterface; if (!union_header) { if (intf->cur_altsetting->desc.bNumEndpoints == 3) { dev_dbg(&intf->dev, "No union descriptor, assuming single interface\n"); combined_interfaces = 1; control_interface = data_interface = intf; goto look_for_collapsed_interface; } else if (call_intf_num > 0) { dev_dbg(&intf->dev, "No union descriptor, using call management descriptor\n"); data_intf_num = call_intf_num; data_interface = usb_ifnum_to_if(usb_dev, data_intf_num); control_interface = intf; } else { dev_dbg(&intf->dev, "No union descriptor, giving up\n"); return -ENODEV; } } else { int class = -1; data_intf_num = union_header->bSlaveInterface0; control_interface = usb_ifnum_to_if(usb_dev, union_header->bMasterInterface0); data_interface = usb_ifnum_to_if(usb_dev, data_intf_num); if (control_interface) class = control_interface->cur_altsetting->desc.bInterfaceClass; if (class != USB_CLASS_COMM && class != USB_CLASS_CDC_DATA) { dev_dbg(&intf->dev, "Broken union descriptor, assuming single interface\n"); combined_interfaces = 1; control_interface = data_interface = intf; goto look_for_collapsed_interface; } } if (!control_interface || !data_interface) { dev_dbg(&intf->dev, "no interfaces\n"); return -ENODEV; } if (data_intf_num != call_intf_num) dev_dbg(&intf->dev, "Separate call control interface. That is not fully supported.\n"); if (control_interface == data_interface) { /* some broken devices designed for windows work this way */ dev_warn(&intf->dev,"Control and data interfaces are not separated!\n"); combined_interfaces = 1; /* a popular other OS doesn't use it */ quirks |= NO_CAP_LINE; if (data_interface->cur_altsetting->desc.bNumEndpoints != 3) { dev_err(&intf->dev, "This needs exactly 3 endpoints\n"); return -EINVAL; } look_for_collapsed_interface: res = usb_find_common_endpoints(data_interface->cur_altsetting, &epread, &epwrite, &epctrl, NULL); if (res) return res; goto made_compressed_probe; } skip_normal_probe: /*workaround for switched interfaces */ if (data_interface->cur_altsetting->desc.bInterfaceClass != USB_CLASS_CDC_DATA) { if (control_interface->cur_altsetting->desc.bInterfaceClass == USB_CLASS_CDC_DATA) { dev_dbg(&intf->dev, "Your device has switched interfaces.\n"); swap(control_interface, data_interface); } else { return -EINVAL; } } /* Accept probe requests only for the control interface */ if (!combined_interfaces && intf != control_interface) return -ENODEV; if (data_interface->cur_altsetting->desc.bNumEndpoints < 2 || control_interface->cur_altsetting->desc.bNumEndpoints == 0) return -EINVAL; epctrl = &control_interface->cur_altsetting->endpoint[0].desc; epread = &data_interface->cur_altsetting->endpoint[0].desc; epwrite = &data_interface->cur_altsetting->endpoint[1].desc; /* workaround for switched endpoints */ if (!usb_endpoint_dir_in(epread)) { /* descriptors are swapped */ dev_dbg(&intf->dev, "The data interface has switched endpoints\n"); swap(epread, epwrite); } made_compressed_probe: dev_dbg(&intf->dev, "interfaces are valid\n"); acm = kzalloc(sizeof(struct acm), GFP_KERNEL); if (!acm) return -ENOMEM; tty_port_init(&acm->port); acm->port.ops = &acm_port_ops; ctrlsize = usb_endpoint_maxp(epctrl); readsize = usb_endpoint_maxp(epread) * (quirks == SINGLE_RX_URB ? 1 : 2); acm->combined_interfaces = combined_interfaces; acm->writesize = usb_endpoint_maxp(epwrite) * 20; acm->control = control_interface; acm->data = data_interface; usb_get_intf(acm->control); /* undone in destruct() */ minor = acm_alloc_minor(acm); if (minor < 0) { acm->minor = ACM_MINOR_INVALID; goto err_put_port; } acm->minor = minor; acm->dev = usb_dev; if (h.usb_cdc_acm_descriptor) acm->ctrl_caps = h.usb_cdc_acm_descriptor->bmCapabilities; if (quirks & NO_CAP_LINE) acm->ctrl_caps &= ~USB_CDC_CAP_LINE; acm->ctrlsize = ctrlsize; acm->readsize = readsize; acm->rx_buflimit = num_rx_buf; INIT_DELAYED_WORK(&acm->dwork, acm_softint); init_waitqueue_head(&acm->wioctl); spin_lock_init(&acm->write_lock); spin_lock_init(&acm->read_lock); mutex_init(&acm->mutex); if (usb_endpoint_xfer_int(epread)) { acm->bInterval = epread->bInterval; acm->in = usb_rcvintpipe(usb_dev, epread->bEndpointAddress); } else { acm->in = usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress); } if (usb_endpoint_xfer_int(epwrite)) acm->out = usb_sndintpipe(usb_dev, epwrite->bEndpointAddress); else acm->out = usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress); init_usb_anchor(&acm->delayed); acm->quirks = quirks; buf = usb_alloc_coherent(usb_dev, ctrlsize, GFP_KERNEL, &acm->ctrl_dma); if (!buf) goto err_put_port; acm->ctrl_buffer = buf; if (acm_write_buffers_alloc(acm) < 0) goto err_free_ctrl_buffer; acm->ctrlurb = usb_alloc_urb(0, GFP_KERNEL); if (!acm->ctrlurb) goto err_free_write_buffers; for (i = 0; i < num_rx_buf; i++) { struct acm_rb *rb = &(acm->read_buffers[i]); struct urb *urb; rb->base = usb_alloc_coherent(acm->dev, readsize, GFP_KERNEL, &rb->dma); if (!rb->base) goto err_free_read_urbs; rb->index = i; rb->instance = acm; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) goto err_free_read_urbs; urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; urb->transfer_dma = rb->dma; if (usb_endpoint_xfer_int(epread)) usb_fill_int_urb(urb, acm->dev, acm->in, rb->base, acm->readsize, acm_read_bulk_callback, rb, acm->bInterval); else usb_fill_bulk_urb(urb, acm->dev, acm->in, rb->base, acm->readsize, acm_read_bulk_callback, rb); acm->read_urbs[i] = urb; __set_bit(i, &acm->read_urbs_free); } for (i = 0; i < ACM_NW; i++) { struct acm_wb *snd = &(acm->wb[i]); snd->urb = usb_alloc_urb(0, GFP_KERNEL); if (!snd->urb) goto err_free_write_urbs; if (usb_endpoint_xfer_int(epwrite)) usb_fill_int_urb(snd->urb, usb_dev, acm->out, NULL, acm->writesize, acm_write_bulk, snd, epwrite->bInterval); else usb_fill_bulk_urb(snd->urb, usb_dev, acm->out, NULL, acm->writesize, acm_write_bulk, snd); snd->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; if (quirks & SEND_ZERO_PACKET) snd->urb->transfer_flags |= URB_ZERO_PACKET; snd->instance = acm; } usb_set_intfdata(intf, acm); i = device_create_file(&intf->dev, &dev_attr_bmCapabilities); if (i < 0) goto err_free_write_urbs; if (h.usb_cdc_country_functional_desc) { /* export the country data */ struct usb_cdc_country_functional_desc * cfd = h.usb_cdc_country_functional_desc; acm->country_codes = kmalloc(cfd->bLength - 4, GFP_KERNEL); if (!acm->country_codes) goto skip_countries; acm->country_code_size = cfd->bLength - 4; memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0, cfd->bLength - 4); acm->country_rel_date = cfd->iCountryCodeRelDate; i = device_create_file(&intf->dev, &dev_attr_wCountryCodes); if (i < 0) { kfree(acm->country_codes); acm->country_codes = NULL; acm->country_code_size = 0; goto skip_countries; } i = device_create_file(&intf->dev, &dev_attr_iCountryCodeRelDate); if (i < 0) { device_remove_file(&intf->dev, &dev_attr_wCountryCodes); kfree(acm->country_codes); acm->country_codes = NULL; acm->country_code_size = 0; goto skip_countries; } } skip_countries: usb_fill_int_urb(acm->ctrlurb, usb_dev, usb_rcvintpipe(usb_dev, epctrl->bEndpointAddress), acm->ctrl_buffer, ctrlsize, acm_ctrl_irq, acm, /* works around buggy devices */ epctrl->bInterval ? epctrl->bInterval : 16); acm->ctrlurb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; acm->ctrlurb->transfer_dma = acm->ctrl_dma; acm->notification_buffer = NULL; acm->nb_index = 0; acm->nb_size = 0; acm->line.dwDTERate = cpu_to_le32(9600); acm->line.bDataBits = 8; acm_set_line(acm, &acm->line); if (!acm->combined_interfaces) { rv = usb_driver_claim_interface(&acm_driver, data_interface, acm); if (rv) goto err_remove_files; } if (quirks & CLEAR_HALT_CONDITIONS) { /* errors intentionally ignored */ usb_clear_halt(usb_dev, acm->in); usb_clear_halt(usb_dev, acm->out); } tty_dev = tty_port_register_device(&acm->port, acm_tty_driver, minor, &control_interface->dev); if (IS_ERR(tty_dev)) { rv = PTR_ERR(tty_dev); goto err_release_data_interface; } dev_info(&intf->dev, "ttyACM%d: USB ACM device\n", minor); return 0; err_release_data_interface: if (!acm->combined_interfaces) { /* Clear driver data so that disconnect() returns early. */ usb_set_intfdata(data_interface, NULL); usb_driver_release_interface(&acm_driver, data_interface); } err_remove_files: if (acm->country_codes) { device_remove_file(&acm->control->dev, &dev_attr_wCountryCodes); device_remove_file(&acm->control->dev, &dev_attr_iCountryCodeRelDate); } device_remove_file(&acm->control->dev, &dev_attr_bmCapabilities); err_free_write_urbs: for (i = 0; i < ACM_NW; i++) usb_free_urb(acm->wb[i].urb); err_free_read_urbs: for (i = 0; i < num_rx_buf; i++) usb_free_urb(acm->read_urbs[i]); acm_read_buffers_free(acm); usb_free_urb(acm->ctrlurb); err_free_write_buffers: acm_write_buffers_free(acm); err_free_ctrl_buffer: usb_free_coherent(usb_dev, ctrlsize, acm->ctrl_buffer, acm->ctrl_dma); err_put_port: tty_port_put(&acm->port); return rv; } static void acm_disconnect(struct usb_interface *intf) { struct acm *acm = usb_get_intfdata(intf); struct tty_struct *tty; int i; /* sibling interface is already cleaning up */ if (!acm) return; acm->disconnected = true; /* * there is a circular dependency. acm_softint() can resubmit * the URBs in error handling so we need to block any * submission right away */ acm_poison_urbs(acm); mutex_lock(&acm->mutex); if (acm->country_codes) { device_remove_file(&acm->control->dev, &dev_attr_wCountryCodes); device_remove_file(&acm->control->dev, &dev_attr_iCountryCodeRelDate); } wake_up_all(&acm->wioctl); device_remove_file(&acm->control->dev, &dev_attr_bmCapabilities); usb_set_intfdata(acm->control, NULL); usb_set_intfdata(acm->data, NULL); mutex_unlock(&acm->mutex); tty = tty_port_tty_get(&acm->port); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } cancel_delayed_work_sync(&acm->dwork); tty_unregister_device(acm_tty_driver, acm->minor); usb_free_urb(acm->ctrlurb); for (i = 0; i < ACM_NW; i++) usb_free_urb(acm->wb[i].urb); for (i = 0; i < acm->rx_buflimit; i++) usb_free_urb(acm->read_urbs[i]); acm_write_buffers_free(acm); usb_free_coherent(acm->dev, acm->ctrlsize, acm->ctrl_buffer, acm->ctrl_dma); acm_read_buffers_free(acm); kfree(acm->notification_buffer); if (!acm->combined_interfaces) usb_driver_release_interface(&acm_driver, intf == acm->control ? acm->data : acm->control); tty_port_put(&acm->port); } #ifdef CONFIG_PM static int acm_suspend(struct usb_interface *intf, pm_message_t message) { struct acm *acm = usb_get_intfdata(intf); int cnt; spin_lock_irq(&acm->write_lock); if (PMSG_IS_AUTO(message)) { if (acm->transmitting) { spin_unlock_irq(&acm->write_lock); return -EBUSY; } } cnt = acm->susp_count++; spin_unlock_irq(&acm->write_lock); if (cnt) return 0; acm_poison_urbs(acm); cancel_delayed_work_sync(&acm->dwork); acm->urbs_in_error_delay = 0; return 0; } static int acm_resume(struct usb_interface *intf) { struct acm *acm = usb_get_intfdata(intf); struct urb *urb; int rv = 0; spin_lock_irq(&acm->write_lock); if (--acm->susp_count) goto out; acm_unpoison_urbs(acm); if (tty_port_initialized(&acm->port)) { rv = usb_submit_urb(acm->ctrlurb, GFP_ATOMIC); for (;;) { urb = usb_get_from_anchor(&acm->delayed); if (!urb) break; acm_start_wb(acm, urb->context); } /* * delayed error checking because we must * do the write path at all cost */ if (rv < 0) goto out; rv = acm_submit_read_urbs(acm, GFP_ATOMIC); } out: spin_unlock_irq(&acm->write_lock); return rv; } static int acm_reset_resume(struct usb_interface *intf) { struct acm *acm = usb_get_intfdata(intf); if (tty_port_initialized(&acm->port)) tty_port_tty_hangup(&acm->port, false); return acm_resume(intf); } #endif /* CONFIG_PM */ static int acm_pre_reset(struct usb_interface *intf) { struct acm *acm = usb_get_intfdata(intf); clear_bit(EVENT_RX_STALL, &acm->flags); acm->nb_index = 0; /* pending control transfers are lost */ return 0; } #define NOKIA_PCSUITE_ACM_INFO(x) \ USB_DEVICE_AND_INTERFACE_INFO(0x0421, x, \ USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, \ USB_CDC_ACM_PROTO_VENDOR) #define SAMSUNG_PCSUITE_ACM_INFO(x) \ USB_DEVICE_AND_INTERFACE_INFO(0x04e7, x, \ USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, \ USB_CDC_ACM_PROTO_VENDOR) /* * USB driver structure. */ static const struct usb_device_id acm_ids[] = { /* quirky and broken devices */ { USB_DEVICE(0x0424, 0x274e), /* Microchip Technology, Inc. (formerly SMSC) */ .driver_info = DISABLE_ECHO, }, /* DISABLE ECHO in termios flag */ { USB_DEVICE(0x076d, 0x0006), /* Denso Cradle CU-321 */ .driver_info = NO_UNION_NORMAL, },/* has no union descriptor */ { USB_DEVICE(0x17ef, 0x7000), /* Lenovo USB modem */ .driver_info = NO_UNION_NORMAL, },/* has no union descriptor */ { USB_DEVICE(0x0870, 0x0001), /* Metricom GS Modem */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x045b, 0x023c), /* Renesas R-Car H3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, { USB_DEVICE(0x045b, 0x0247), /* Renesas R-Car D3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, { USB_DEVICE(0x045b, 0x0248), /* Renesas R-Car M3-N USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, { USB_DEVICE(0x045b, 0x024D), /* Renesas R-Car E3 USB Download mode */ .driver_info = DISABLE_ECHO, /* Don't echo banner */ }, { USB_DEVICE(0x0e8d, 0x0003), /* FIREFLY, MediaTek Inc; andrey.arapov@gmail.com */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0e8d, 0x2000), /* MediaTek Inc Preloader */ .driver_info = DISABLE_ECHO, /* DISABLE ECHO in termios flag */ }, { USB_DEVICE(0x0e8d, 0x3329), /* MediaTek Inc GPS */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0482, 0x0203), /* KYOCERA AH-K3001V */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x079b, 0x000f), /* BT On-Air USB MODEM */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0ace, 0x1602), /* ZyDAS 56K USB MODEM */ .driver_info = SINGLE_RX_URB, }, { USB_DEVICE(0x0ace, 0x1608), /* ZyDAS 56K USB MODEM */ .driver_info = SINGLE_RX_URB, /* firmware bug */ }, { USB_DEVICE(0x0ace, 0x1611), /* ZyDAS 56K USB MODEM - new version */ .driver_info = SINGLE_RX_URB, /* firmware bug */ }, { USB_DEVICE(0x11ca, 0x0201), /* VeriFone Mx870 Gadget Serial */ .driver_info = SINGLE_RX_URB, }, { USB_DEVICE(0x1901, 0x0006), /* GE Healthcare Patient Monitor UI Controller */ .driver_info = DISABLE_ECHO, /* DISABLE ECHO in termios flag */ }, { USB_DEVICE(0x1965, 0x0018), /* Uniden UBC125XLT */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x22b8, 0x7000), /* Motorola Q Phone */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0803, 0x3095), /* Zoom Telephonics Model 3095F USB MODEM */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0572, 0x1321), /* Conexant USB MODEM CX93010 */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0572, 0x1324), /* Conexant USB MODEM RD02-D400 */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0572, 0x1328), /* Shiro / Aztech USB MODEM UM-3100 */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x0572, 0x1349), /* Hiro (Conexant) USB MODEM H50228 */ .driver_info = NO_UNION_NORMAL, /* has no union descriptor */ }, { USB_DEVICE(0x20df, 0x0001), /* Simtec Electronics Entropy Key */ .driver_info = QUIRK_CONTROL_LINE_STATE, }, { USB_DEVICE(0x2184, 0x001c) }, /* GW Instek AFG-2225 */ { USB_DEVICE(0x2184, 0x0036) }, /* GW Instek AFG-125 */ { USB_DEVICE(0x22b8, 0x6425), /* Motorola MOTOMAGX phones */ }, /* Motorola H24 HSPA module: */ { USB_DEVICE(0x22b8, 0x2d91) }, /* modem */ { USB_DEVICE(0x22b8, 0x2d92), /* modem + diagnostics */ .driver_info = NO_UNION_NORMAL, /* handle only modem interface */ }, { USB_DEVICE(0x22b8, 0x2d93), /* modem + AT port */ .driver_info = NO_UNION_NORMAL, /* handle only modem interface */ }, { USB_DEVICE(0x22b8, 0x2d95), /* modem + AT port + diagnostics */ .driver_info = NO_UNION_NORMAL, /* handle only modem interface */ }, { USB_DEVICE(0x22b8, 0x2d96), /* modem + NMEA */ .driver_info = NO_UNION_NORMAL, /* handle only modem interface */ }, { USB_DEVICE(0x22b8, 0x2d97), /* modem + diagnostics + NMEA */ .driver_info = NO_UNION_NORMAL, /* handle only modem interface */ }, { USB_DEVICE(0x22b8, 0x2d99), /* modem + AT port + NMEA */ .driver_info = NO_UNION_NORMAL, /* handle only modem interface */ }, { USB_DEVICE(0x22b8, 0x2d9a), /* modem + AT port + diagnostics + NMEA */ .driver_info = NO_UNION_NORMAL, /* handle only modem interface */ }, { USB_DEVICE(0x0572, 0x1329), /* Hummingbird huc56s (Conexant) */ .driver_info = NO_UNION_NORMAL, /* union descriptor misplaced on data interface instead of communications interface. Maybe we should define a new quirk for this. */ }, { USB_DEVICE(0x0572, 0x1340), /* Conexant CX93010-2x UCMxx */ .driver_info = NO_UNION_NORMAL, }, { USB_DEVICE(0x05f9, 0x4002), /* PSC Scanning, Magellan 800i */ .driver_info = NO_UNION_NORMAL, }, { USB_DEVICE(0x1bbb, 0x0003), /* Alcatel OT-I650 */ .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ }, { USB_DEVICE(0x1576, 0x03b1), /* Maretron USB100 */ .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ }, { USB_DEVICE(0xfff0, 0x0100), /* DATECS FP-2000 */ .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ }, { USB_DEVICE(0x09d8, 0x0320), /* Elatec GmbH TWN3 */ .driver_info = NO_UNION_NORMAL, /* has misplaced union descriptor */ }, { USB_DEVICE(0x0c26, 0x0020), /* Icom ICF3400 Serie */ .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ }, { USB_DEVICE(0x0ca6, 0xa050), /* Castles VEGA3000 */ .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ }, { USB_DEVICE(0x2912, 0x0001), /* ATOL FPrint */ .driver_info = CLEAR_HALT_CONDITIONS, }, /* Nokia S60 phones expose two ACM channels. The first is * a modem and is picked up by the standard AT-command * information below. The second is 'vendor-specific' but * is treated as a serial device at the S60 end, so we want * to expose it on Linux too. */ { NOKIA_PCSUITE_ACM_INFO(0x042D), }, /* Nokia 3250 */ { NOKIA_PCSUITE_ACM_INFO(0x04D8), }, /* Nokia 5500 Sport */ { NOKIA_PCSUITE_ACM_INFO(0x04C9), }, /* Nokia E50 */ { NOKIA_PCSUITE_ACM_INFO(0x0419), }, /* Nokia E60 */ { NOKIA_PCSUITE_ACM_INFO(0x044D), }, /* Nokia E61 */ { NOKIA_PCSUITE_ACM_INFO(0x0001), }, /* Nokia E61i */ { NOKIA_PCSUITE_ACM_INFO(0x0475), }, /* Nokia E62 */ { NOKIA_PCSUITE_ACM_INFO(0x0508), }, /* Nokia E65 */ { NOKIA_PCSUITE_ACM_INFO(0x0418), }, /* Nokia E70 */ { NOKIA_PCSUITE_ACM_INFO(0x0425), }, /* Nokia N71 */ { NOKIA_PCSUITE_ACM_INFO(0x0486), }, /* Nokia N73 */ { NOKIA_PCSUITE_ACM_INFO(0x04DF), }, /* Nokia N75 */ { NOKIA_PCSUITE_ACM_INFO(0x000e), }, /* Nokia N77 */ { NOKIA_PCSUITE_ACM_INFO(0x0445), }, /* Nokia N80 */ { NOKIA_PCSUITE_ACM_INFO(0x042F), }, /* Nokia N91 & N91 8GB */ { NOKIA_PCSUITE_ACM_INFO(0x048E), }, /* Nokia N92 */ { NOKIA_PCSUITE_ACM_INFO(0x0420), }, /* Nokia N93 */ { NOKIA_PCSUITE_ACM_INFO(0x04E6), }, /* Nokia N93i */ { NOKIA_PCSUITE_ACM_INFO(0x04B2), }, /* Nokia 5700 XpressMusic */ { NOKIA_PCSUITE_ACM_INFO(0x0134), }, /* Nokia 6110 Navigator (China) */ { NOKIA_PCSUITE_ACM_INFO(0x046E), }, /* Nokia 6110 Navigator */ { NOKIA_PCSUITE_ACM_INFO(0x002f), }, /* Nokia 6120 classic & */ { NOKIA_PCSUITE_ACM_INFO(0x0088), }, /* Nokia 6121 classic */ { NOKIA_PCSUITE_ACM_INFO(0x00fc), }, /* Nokia 6124 classic */ { NOKIA_PCSUITE_ACM_INFO(0x0042), }, /* Nokia E51 */ { NOKIA_PCSUITE_ACM_INFO(0x00b0), }, /* Nokia E66 */ { NOKIA_PCSUITE_ACM_INFO(0x00ab), }, /* Nokia E71 */ { NOKIA_PCSUITE_ACM_INFO(0x0481), }, /* Nokia N76 */ { NOKIA_PCSUITE_ACM_INFO(0x0007), }, /* Nokia N81 & N81 8GB */ { NOKIA_PCSUITE_ACM_INFO(0x0071), }, /* Nokia N82 */ { NOKIA_PCSUITE_ACM_INFO(0x04F0), }, /* Nokia N95 & N95-3 NAM */ { NOKIA_PCSUITE_ACM_INFO(0x0070), }, /* Nokia N95 8GB */ { NOKIA_PCSUITE_ACM_INFO(0x00e9), }, /* Nokia 5320 XpressMusic */ { NOKIA_PCSUITE_ACM_INFO(0x0099), }, /* Nokia 6210 Navigator, RM-367 */ { NOKIA_PCSUITE_ACM_INFO(0x0128), }, /* Nokia 6210 Navigator, RM-419 */ { NOKIA_PCSUITE_ACM_INFO(0x008f), }, /* Nokia 6220 Classic */ { NOKIA_PCSUITE_ACM_INFO(0x00a0), }, /* Nokia 6650 */ { NOKIA_PCSUITE_ACM_INFO(0x007b), }, /* Nokia N78 */ { NOKIA_PCSUITE_ACM_INFO(0x0094), }, /* Nokia N85 */ { NOKIA_PCSUITE_ACM_INFO(0x003a), }, /* Nokia N96 & N96-3 */ { NOKIA_PCSUITE_ACM_INFO(0x00e9), }, /* Nokia 5320 XpressMusic */ { NOKIA_PCSUITE_ACM_INFO(0x0108), }, /* Nokia 5320 XpressMusic 2G */ { NOKIA_PCSUITE_ACM_INFO(0x01f5), }, /* Nokia N97, RM-505 */ { NOKIA_PCSUITE_ACM_INFO(0x02e3), }, /* Nokia 5230, RM-588 */ { NOKIA_PCSUITE_ACM_INFO(0x0178), }, /* Nokia E63 */ { NOKIA_PCSUITE_ACM_INFO(0x010e), }, /* Nokia E75 */ { NOKIA_PCSUITE_ACM_INFO(0x02d9), }, /* Nokia 6760 Slide */ { NOKIA_PCSUITE_ACM_INFO(0x01d0), }, /* Nokia E52 */ { NOKIA_PCSUITE_ACM_INFO(0x0223), }, /* Nokia E72 */ { NOKIA_PCSUITE_ACM_INFO(0x0275), }, /* Nokia X6 */ { NOKIA_PCSUITE_ACM_INFO(0x026c), }, /* Nokia N97 Mini */ { NOKIA_PCSUITE_ACM_INFO(0x0154), }, /* Nokia 5800 XpressMusic */ { NOKIA_PCSUITE_ACM_INFO(0x04ce), }, /* Nokia E90 */ { NOKIA_PCSUITE_ACM_INFO(0x01d4), }, /* Nokia E55 */ { NOKIA_PCSUITE_ACM_INFO(0x0302), }, /* Nokia N8 */ { NOKIA_PCSUITE_ACM_INFO(0x0335), }, /* Nokia E7 */ { NOKIA_PCSUITE_ACM_INFO(0x03cd), }, /* Nokia C7 */ { SAMSUNG_PCSUITE_ACM_INFO(0x6651), }, /* Samsung GTi8510 (INNOV8) */ /* Support for Owen devices */ { USB_DEVICE(0x03eb, 0x0030), }, /* Owen SI30 */ /* NOTE: non-Nokia COMM/ACM/0xff is likely MSFT RNDIS... NOT a modem! */ #if IS_ENABLED(CONFIG_INPUT_IMS_PCU) { USB_DEVICE(0x04d8, 0x0082), /* Application mode */ .driver_info = IGNORE_DEVICE, }, { USB_DEVICE(0x04d8, 0x0083), /* Bootloader mode */ .driver_info = IGNORE_DEVICE, }, #endif #if IS_ENABLED(CONFIG_IR_TOY) { USB_DEVICE(0x04d8, 0xfd08), .driver_info = IGNORE_DEVICE, }, { USB_DEVICE(0x04d8, 0xf58b), .driver_info = IGNORE_DEVICE, }, #endif #if IS_ENABLED(CONFIG_USB_SERIAL_XR) { USB_DEVICE(0x04e2, 0x1400), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1401), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1402), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1403), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1410), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1411), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1412), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1414), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1420), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1422), .driver_info = IGNORE_DEVICE }, { USB_DEVICE(0x04e2, 0x1424), .driver_info = IGNORE_DEVICE }, #endif /*Samsung phone in firmware update mode */ { USB_DEVICE(0x04e8, 0x685d), .driver_info = IGNORE_DEVICE, }, /* Exclude Infineon Flash Loader utility */ { USB_DEVICE(0x058b, 0x0041), .driver_info = IGNORE_DEVICE, }, /* Exclude ETAS ES58x */ { USB_DEVICE(0x108c, 0x0159), /* ES581.4 */ .driver_info = IGNORE_DEVICE, }, { USB_DEVICE(0x108c, 0x0168), /* ES582.1 */ .driver_info = IGNORE_DEVICE, }, { USB_DEVICE(0x108c, 0x0169), /* ES584.1 */ .driver_info = IGNORE_DEVICE, }, { USB_DEVICE(0x1bc7, 0x0021), /* Telit 3G ACM only composition */ .driver_info = SEND_ZERO_PACKET, }, { USB_DEVICE(0x1bc7, 0x0023), /* Telit 3G ACM + ECM composition */ .driver_info = SEND_ZERO_PACKET, }, /* Exclude Goodix Fingerprint Reader */ { USB_DEVICE(0x27c6, 0x5395), .driver_info = IGNORE_DEVICE, }, /* Exclude Heimann Sensor GmbH USB appset demo */ { USB_DEVICE(0x32a7, 0x0000), .driver_info = IGNORE_DEVICE, }, /* control interfaces without any protocol set */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_PROTO_NONE) }, /* control interfaces with various AT-command sets */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_ACM_PROTO_AT_V25TER) }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_ACM_PROTO_AT_PCCA101) }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_ACM_PROTO_AT_PCCA101_WAKE) }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_ACM_PROTO_AT_GSM) }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_ACM_PROTO_AT_3G) }, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM, USB_CDC_ACM_PROTO_AT_CDMA) }, { USB_DEVICE(0x1519, 0x0452), /* Intel 7260 modem */ .driver_info = SEND_ZERO_PACKET, }, { } }; MODULE_DEVICE_TABLE(usb, acm_ids); static struct usb_driver acm_driver = { .name = "cdc_acm", .probe = acm_probe, .disconnect = acm_disconnect, #ifdef CONFIG_PM .suspend = acm_suspend, .resume = acm_resume, .reset_resume = acm_reset_resume, #endif .pre_reset = acm_pre_reset, .id_table = acm_ids, #ifdef CONFIG_PM .supports_autosuspend = 1, #endif .disable_hub_initiated_lpm = 1, }; /* * TTY driver structures. */ static const struct tty_operations acm_ops = { .install = acm_tty_install, .open = acm_tty_open, .close = acm_tty_close, .cleanup = acm_tty_cleanup, .hangup = acm_tty_hangup, .write = acm_tty_write, .write_room = acm_tty_write_room, .ioctl = acm_tty_ioctl, .throttle = acm_tty_throttle, .unthrottle = acm_tty_unthrottle, .chars_in_buffer = acm_tty_chars_in_buffer, .break_ctl = acm_tty_break_ctl, .set_termios = acm_tty_set_termios, .tiocmget = acm_tty_tiocmget, .tiocmset = acm_tty_tiocmset, .get_serial = get_serial_info, .set_serial = set_serial_info, .get_icount = acm_tty_get_icount, }; /* * Init / exit. */ static int __init acm_init(void) { int retval; acm_tty_driver = tty_alloc_driver(ACM_TTY_MINORS, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(acm_tty_driver)) return PTR_ERR(acm_tty_driver); acm_tty_driver->driver_name = "acm", acm_tty_driver->name = "ttyACM", acm_tty_driver->major = ACM_TTY_MAJOR, acm_tty_driver->minor_start = 0, acm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL, acm_tty_driver->subtype = SERIAL_TYPE_NORMAL, acm_tty_driver->init_termios = tty_std_termios; acm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; tty_set_operations(acm_tty_driver, &acm_ops); retval = tty_register_driver(acm_tty_driver); if (retval) { tty_driver_kref_put(acm_tty_driver); return retval; } retval = usb_register(&acm_driver); if (retval) { tty_unregister_driver(acm_tty_driver); tty_driver_kref_put(acm_tty_driver); return retval; } printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_DESC "\n"); return 0; } static void __exit acm_exit(void) { usb_deregister(&acm_driver); tty_unregister_driver(acm_tty_driver); tty_driver_kref_put(acm_tty_driver); idr_destroy(&acm_minors); } module_init(acm_init); module_exit(acm_exit); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV_MAJOR(ACM_TTY_MAJOR);
1 1 8 7 5 5 51 55 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _CCID_H #define _CCID_H /* * net/dccp/ccid.h * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * CCID infrastructure */ #include <net/sock.h> #include <linux/compiler.h> #include <linux/dccp.h> #include <linux/list.h> #include <linux/module.h> /* maximum value for a CCID (RFC 4340, 19.5) */ #define CCID_MAX 255 #define CCID_SLAB_NAME_LENGTH 32 struct tcp_info; /** * struct ccid_operations - Interface to Congestion-Control Infrastructure * * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) * @ccid_name: alphabetical identifier string for @ccid_id * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket * * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) * @ccid_hc_rx_packet_recv: implements the HC-receiver side * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender */ struct ccid_operations { unsigned char ccid_id; __u32 ccid_ccmps; const char *ccid_name; struct kmem_cache *ccid_hc_rx_slab, *ccid_hc_tx_slab; char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; __u32 ccid_hc_rx_obj_size, ccid_hc_tx_obj_size; /* Interface Routines */ int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); void (*ccid_hc_rx_exit)(struct sock *sk); void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, struct tcp_info *info); int (*ccid_hc_rx_getsockopt)(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen); int (*ccid_hc_tx_getsockopt)(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen); }; extern struct ccid_operations ccid2_ops; #ifdef CONFIG_IP_DCCP_CCID3 extern struct ccid_operations ccid3_ops; #endif int ccid_initialize_builtins(void); void ccid_cleanup_builtins(void); struct ccid { struct ccid_operations *ccid_ops; char ccid_priv[]; }; static inline void *ccid_priv(const struct ccid *ccid) { return (void *)ccid->ccid_priv; } bool ccid_support_check(u8 const *ccid_array, u8 array_len); int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, char __user *, int __user *); struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_rx_ccid; if (ccid == NULL || ccid->ccid_ops == NULL) return -1; return ccid->ccid_ops->ccid_id; } static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_tx_ccid; if (ccid == NULL || ccid->ccid_ops == NULL) return -1; return ccid->ccid_ops->ccid_id; } void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); /* * Congestion control of queued data packets via CCID decision. * * The TX CCID performs its congestion-control by indicating whether and when a * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). * The following modes are supported via the symbolic constants below: * - timer-based pacing (CCID returns a delay value in milliseconds); * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). */ enum ccid_dequeueing_decision { CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ CCID_PACKET_ERR = 0xF0000, /* error condition */ }; static inline int ccid_packet_dequeue_eval(const int return_code) { if (return_code < 0) return CCID_PACKET_ERR; if (return_code == 0) return CCID_PACKET_SEND_AT_ONCE; if (return_code <= CCID_PACKET_DELAY_MAX) return CCID_PACKET_DELAY; return return_code; } static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); return CCID_PACKET_SEND_AT_ONCE; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); } static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } /** * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) * @val: value of @opt * @len: length of @val in bytes */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); } /** * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender * Arguments are analogous to ccid_hc_tx_parse_options() */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); return 0; } static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, struct tcp_info *info) { if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL) ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); } static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, struct tcp_info *info) { if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); } static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, optval, optlen); return rc; } static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, optval, optlen); return rc; } #endif /* _CCID_H */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Vojtech Pavlik * * CATC EL1210A NetMate USB Ethernet driver * * Sponsored by SuSE * * Based on the work of * Donald Becker * * Old chipset support added by Simon Evans <spse@secret.org.uk> 2002 * - adds support for Belkin F5U011 */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to <vojtech@suse.cz>, or by paper mail: * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/crc32.h> #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/uaccess.h> #undef DEBUG #include <linux/usb.h> /* * Version information. */ #define DRIVER_VERSION "v2.8" #define DRIVER_AUTHOR "Vojtech Pavlik <vojtech@suse.cz>" #define DRIVER_DESC "CATC EL1210A NetMate USB Ethernet driver" #define SHORT_DRIVER_DESC "EL1210A NetMate USB Ethernet" MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); static const char driver_name[] = "catc"; /* * Some defines. */ #define STATS_UPDATE (HZ) /* Time between stats updates */ #define TX_TIMEOUT (5*HZ) /* Max time the queue can be stopped */ #define PKT_SZ 1536 /* Max Ethernet packet size */ #define RX_MAX_BURST 15 /* Max packets per rx buffer (> 0, < 16) */ #define TX_MAX_BURST 15 /* Max full sized packets per tx buffer (> 0) */ #define CTRL_QUEUE 16 /* Max control requests in flight (power of two) */ #define RX_PKT_SZ 1600 /* Max size of receive packet for F5U011 */ /* * Control requests. */ enum control_requests { ReadMem = 0xf1, GetMac = 0xf2, Reset = 0xf4, SetMac = 0xf5, SetRxMode = 0xf5, /* F5U011 only */ WriteROM = 0xf8, SetReg = 0xfa, GetReg = 0xfb, WriteMem = 0xfc, ReadROM = 0xfd, }; /* * Registers. */ enum register_offsets { TxBufCount = 0x20, RxBufCount = 0x21, OpModes = 0x22, TxQed = 0x23, RxQed = 0x24, MaxBurst = 0x25, RxUnit = 0x60, EthStatus = 0x61, StationAddr0 = 0x67, EthStats = 0x69, LEDCtrl = 0x81, }; enum eth_stats { TxSingleColl = 0x00, TxMultiColl = 0x02, TxExcessColl = 0x04, RxFramErr = 0x06, }; enum op_mode_bits { Op3MemWaits = 0x03, OpLenInclude = 0x08, OpRxMerge = 0x10, OpTxMerge = 0x20, OpWin95bugfix = 0x40, OpLoopback = 0x80, }; enum rx_filter_bits { RxEnable = 0x01, RxPolarity = 0x02, RxForceOK = 0x04, RxMultiCast = 0x08, RxPromisc = 0x10, AltRxPromisc = 0x20, /* F5U011 uses different bit */ }; enum led_values { LEDFast = 0x01, LEDSlow = 0x02, LEDFlash = 0x03, LEDPulse = 0x04, LEDLink = 0x08, }; enum link_status { LinkNoChange = 0, LinkGood = 1, LinkBad = 2 }; /* * The catc struct. */ #define CTRL_RUNNING 0 #define RX_RUNNING 1 #define TX_RUNNING 2 struct catc { struct net_device *netdev; struct usb_device *usbdev; unsigned long flags; unsigned int tx_ptr, tx_idx; unsigned int ctrl_head, ctrl_tail; spinlock_t tx_lock, ctrl_lock; u8 tx_buf[2][TX_MAX_BURST * (PKT_SZ + 2)]; u8 rx_buf[RX_MAX_BURST * (PKT_SZ + 2)]; u8 irq_buf[2]; u8 ctrl_buf[64]; struct usb_ctrlrequest ctrl_dr; struct timer_list timer; u8 stats_buf[8]; u16 stats_vals[4]; unsigned long last_stats; u8 multicast[64]; struct ctrl_queue { u8 dir; u8 request; u16 value; u16 index; void *buf; int len; void (*callback)(struct catc *catc, struct ctrl_queue *q); } ctrl_queue[CTRL_QUEUE]; struct urb *tx_urb, *rx_urb, *irq_urb, *ctrl_urb; u8 is_f5u011; /* Set if device is an F5U011 */ u8 rxmode[2]; /* Used for F5U011 */ atomic_t recq_sz; /* Used for F5U011 - counter of waiting rx packets */ }; /* * Useful macros. */ #define catc_get_mac(catc, mac) catc_ctrl_msg(catc, USB_DIR_IN, GetMac, 0, 0, mac, 6) #define catc_reset(catc) catc_ctrl_msg(catc, USB_DIR_OUT, Reset, 0, 0, NULL, 0) #define catc_set_reg(catc, reg, val) catc_ctrl_msg(catc, USB_DIR_OUT, SetReg, val, reg, NULL, 0) #define catc_get_reg(catc, reg, buf) catc_ctrl_msg(catc, USB_DIR_IN, GetReg, 0, reg, buf, 1) #define catc_write_mem(catc, addr, buf, size) catc_ctrl_msg(catc, USB_DIR_OUT, WriteMem, 0, addr, buf, size) #define catc_read_mem(catc, addr, buf, size) catc_ctrl_msg(catc, USB_DIR_IN, ReadMem, 0, addr, buf, size) #define f5u011_rxmode(catc, rxmode) catc_ctrl_msg(catc, USB_DIR_OUT, SetRxMode, 0, 1, rxmode, 2) #define f5u011_rxmode_async(catc, rxmode) catc_ctrl_async(catc, USB_DIR_OUT, SetRxMode, 0, 1, &rxmode, 2, NULL) #define f5u011_mchash_async(catc, hash) catc_ctrl_async(catc, USB_DIR_OUT, SetRxMode, 0, 2, &hash, 8, NULL) #define catc_set_reg_async(catc, reg, val) catc_ctrl_async(catc, USB_DIR_OUT, SetReg, val, reg, NULL, 0, NULL) #define catc_get_reg_async(catc, reg, cb) catc_ctrl_async(catc, USB_DIR_IN, GetReg, 0, reg, NULL, 1, cb) #define catc_write_mem_async(catc, addr, buf, size) catc_ctrl_async(catc, USB_DIR_OUT, WriteMem, 0, addr, buf, size, NULL) /* * Receive routines. */ static void catc_rx_done(struct urb *urb) { struct catc *catc = urb->context; u8 *pkt_start = urb->transfer_buffer; struct sk_buff *skb; int pkt_len, pkt_offset = 0; int status = urb->status; if (!catc->is_f5u011) { clear_bit(RX_RUNNING, &catc->flags); pkt_offset = 2; } if (status) { dev_dbg(&urb->dev->dev, "rx_done, status %d, length %d\n", status, urb->actual_length); return; } do { if(!catc->is_f5u011) { pkt_len = le16_to_cpup((__le16*)pkt_start); if (pkt_len > urb->actual_length) { catc->netdev->stats.rx_length_errors++; catc->netdev->stats.rx_errors++; break; } } else { pkt_len = urb->actual_length; } if (!(skb = dev_alloc_skb(pkt_len))) return; skb_copy_to_linear_data(skb, pkt_start + pkt_offset, pkt_len); skb_put(skb, pkt_len); skb->protocol = eth_type_trans(skb, catc->netdev); netif_rx(skb); catc->netdev->stats.rx_packets++; catc->netdev->stats.rx_bytes += pkt_len; /* F5U011 only does one packet per RX */ if (catc->is_f5u011) break; pkt_start += (((pkt_len + 1) >> 6) + 1) << 6; } while (pkt_start - (u8 *) urb->transfer_buffer < urb->actual_length); if (catc->is_f5u011) { if (atomic_read(&catc->recq_sz)) { int state; atomic_dec(&catc->recq_sz); netdev_dbg(catc->netdev, "getting extra packet\n"); urb->dev = catc->usbdev; if ((state = usb_submit_urb(urb, GFP_ATOMIC)) < 0) { netdev_dbg(catc->netdev, "submit(rx_urb) status %d\n", state); } } else { clear_bit(RX_RUNNING, &catc->flags); } } } static void catc_irq_done(struct urb *urb) { struct catc *catc = urb->context; u8 *data = urb->transfer_buffer; int status = urb->status; unsigned int hasdata = 0, linksts = LinkNoChange; int res; if (!catc->is_f5u011) { hasdata = data[1] & 0x80; if (data[1] & 0x40) linksts = LinkGood; else if (data[1] & 0x20) linksts = LinkBad; } else { hasdata = (unsigned int)(be16_to_cpup((__be16*)data) & 0x0fff); if (data[0] == 0x90) linksts = LinkGood; else if (data[0] == 0xA0) linksts = LinkBad; } switch (status) { case 0: /* success */ break; case -ECONNRESET: /* unlink */ case -ENOENT: case -ESHUTDOWN: return; /* -EPIPE: should clear the halt */ default: /* error */ dev_dbg(&urb->dev->dev, "irq_done, status %d, data %02x %02x.\n", status, data[0], data[1]); goto resubmit; } if (linksts == LinkGood) { netif_carrier_on(catc->netdev); netdev_dbg(catc->netdev, "link ok\n"); } if (linksts == LinkBad) { netif_carrier_off(catc->netdev); netdev_dbg(catc->netdev, "link bad\n"); } if (hasdata) { if (test_and_set_bit(RX_RUNNING, &catc->flags)) { if (catc->is_f5u011) atomic_inc(&catc->recq_sz); } else { catc->rx_urb->dev = catc->usbdev; if ((res = usb_submit_urb(catc->rx_urb, GFP_ATOMIC)) < 0) { dev_err(&catc->usbdev->dev, "submit(rx_urb) status %d\n", res); } } } resubmit: res = usb_submit_urb (urb, GFP_ATOMIC); if (res) dev_err(&catc->usbdev->dev, "can't resubmit intr, %s-%s, status %d\n", catc->usbdev->bus->bus_name, catc->usbdev->devpath, res); } /* * Transmit routines. */ static int catc_tx_run(struct catc *catc) { int status; if (catc->is_f5u011) catc->tx_ptr = (catc->tx_ptr + 63) & ~63; catc->tx_urb->transfer_buffer_length = catc->tx_ptr; catc->tx_urb->transfer_buffer = catc->tx_buf[catc->tx_idx]; catc->tx_urb->dev = catc->usbdev; if ((status = usb_submit_urb(catc->tx_urb, GFP_ATOMIC)) < 0) dev_err(&catc->usbdev->dev, "submit(tx_urb), status %d\n", status); catc->tx_idx = !catc->tx_idx; catc->tx_ptr = 0; netif_trans_update(catc->netdev); return status; } static void catc_tx_done(struct urb *urb) { struct catc *catc = urb->context; unsigned long flags; int r, status = urb->status; if (status == -ECONNRESET) { dev_dbg(&urb->dev->dev, "Tx Reset.\n"); urb->status = 0; netif_trans_update(catc->netdev); catc->netdev->stats.tx_errors++; clear_bit(TX_RUNNING, &catc->flags); netif_wake_queue(catc->netdev); return; } if (status) { dev_dbg(&urb->dev->dev, "tx_done, status %d, length %d\n", status, urb->actual_length); return; } spin_lock_irqsave(&catc->tx_lock, flags); if (catc->tx_ptr) { r = catc_tx_run(catc); if (unlikely(r < 0)) clear_bit(TX_RUNNING, &catc->flags); } else { clear_bit(TX_RUNNING, &catc->flags); } netif_wake_queue(catc->netdev); spin_unlock_irqrestore(&catc->tx_lock, flags); } static netdev_tx_t catc_start_xmit(struct sk_buff *skb, struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); unsigned long flags; int r = 0; char *tx_buf; spin_lock_irqsave(&catc->tx_lock, flags); catc->tx_ptr = (((catc->tx_ptr - 1) >> 6) + 1) << 6; tx_buf = catc->tx_buf[catc->tx_idx] + catc->tx_ptr; if (catc->is_f5u011) *(__be16 *)tx_buf = cpu_to_be16(skb->len); else *(__le16 *)tx_buf = cpu_to_le16(skb->len); skb_copy_from_linear_data(skb, tx_buf + 2, skb->len); catc->tx_ptr += skb->len + 2; if (!test_and_set_bit(TX_RUNNING, &catc->flags)) { r = catc_tx_run(catc); if (r < 0) clear_bit(TX_RUNNING, &catc->flags); } if ((catc->is_f5u011 && catc->tx_ptr) || (catc->tx_ptr >= ((TX_MAX_BURST - 1) * (PKT_SZ + 2)))) netif_stop_queue(netdev); spin_unlock_irqrestore(&catc->tx_lock, flags); if (r >= 0) { catc->netdev->stats.tx_bytes += skb->len; catc->netdev->stats.tx_packets++; } dev_kfree_skb(skb); return NETDEV_TX_OK; } static void catc_tx_timeout(struct net_device *netdev, unsigned int txqueue) { struct catc *catc = netdev_priv(netdev); dev_warn(&netdev->dev, "Transmit timed out.\n"); usb_unlink_urb(catc->tx_urb); } /* * Control messages. */ static int catc_ctrl_msg(struct catc *catc, u8 dir, u8 request, u16 value, u16 index, void *buf, int len) { int retval = usb_control_msg(catc->usbdev, dir ? usb_rcvctrlpipe(catc->usbdev, 0) : usb_sndctrlpipe(catc->usbdev, 0), request, 0x40 | dir, value, index, buf, len, 1000); return retval < 0 ? retval : 0; } static void catc_ctrl_run(struct catc *catc) { struct ctrl_queue *q = catc->ctrl_queue + catc->ctrl_tail; struct usb_device *usbdev = catc->usbdev; struct urb *urb = catc->ctrl_urb; struct usb_ctrlrequest *dr = &catc->ctrl_dr; int status; dr->bRequest = q->request; dr->bRequestType = 0x40 | q->dir; dr->wValue = cpu_to_le16(q->value); dr->wIndex = cpu_to_le16(q->index); dr->wLength = cpu_to_le16(q->len); urb->pipe = q->dir ? usb_rcvctrlpipe(usbdev, 0) : usb_sndctrlpipe(usbdev, 0); urb->transfer_buffer_length = q->len; urb->transfer_buffer = catc->ctrl_buf; urb->setup_packet = (void *) dr; urb->dev = usbdev; if (!q->dir && q->buf && q->len) memcpy(catc->ctrl_buf, q->buf, q->len); if ((status = usb_submit_urb(catc->ctrl_urb, GFP_ATOMIC))) dev_err(&catc->usbdev->dev, "submit(ctrl_urb) status %d\n", status); } static void catc_ctrl_done(struct urb *urb) { struct catc *catc = urb->context; struct ctrl_queue *q; unsigned long flags; int status = urb->status; if (status) dev_dbg(&urb->dev->dev, "ctrl_done, status %d, len %d.\n", status, urb->actual_length); spin_lock_irqsave(&catc->ctrl_lock, flags); q = catc->ctrl_queue + catc->ctrl_tail; if (q->dir) { if (q->buf && q->len) memcpy(q->buf, catc->ctrl_buf, q->len); else q->buf = catc->ctrl_buf; } if (q->callback) q->callback(catc, q); catc->ctrl_tail = (catc->ctrl_tail + 1) & (CTRL_QUEUE - 1); if (catc->ctrl_head != catc->ctrl_tail) catc_ctrl_run(catc); else clear_bit(CTRL_RUNNING, &catc->flags); spin_unlock_irqrestore(&catc->ctrl_lock, flags); } static int catc_ctrl_async(struct catc *catc, u8 dir, u8 request, u16 value, u16 index, void *buf, int len, void (*callback)(struct catc *catc, struct ctrl_queue *q)) { struct ctrl_queue *q; int retval = 0; unsigned long flags; spin_lock_irqsave(&catc->ctrl_lock, flags); q = catc->ctrl_queue + catc->ctrl_head; q->dir = dir; q->request = request; q->value = value; q->index = index; q->buf = buf; q->len = len; q->callback = callback; catc->ctrl_head = (catc->ctrl_head + 1) & (CTRL_QUEUE - 1); if (catc->ctrl_head == catc->ctrl_tail) { dev_err(&catc->usbdev->dev, "ctrl queue full\n"); catc->ctrl_tail = (catc->ctrl_tail + 1) & (CTRL_QUEUE - 1); retval = -1; } if (!test_and_set_bit(CTRL_RUNNING, &catc->flags)) catc_ctrl_run(catc); spin_unlock_irqrestore(&catc->ctrl_lock, flags); return retval; } /* * Statistics. */ static void catc_stats_done(struct catc *catc, struct ctrl_queue *q) { int index = q->index - EthStats; u16 data, last; catc->stats_buf[index] = *((char *)q->buf); if (index & 1) return; data = ((u16)catc->stats_buf[index] << 8) | catc->stats_buf[index + 1]; last = catc->stats_vals[index >> 1]; switch (index) { case TxSingleColl: case TxMultiColl: catc->netdev->stats.collisions += data - last; break; case TxExcessColl: catc->netdev->stats.tx_aborted_errors += data - last; catc->netdev->stats.tx_errors += data - last; break; case RxFramErr: catc->netdev->stats.rx_frame_errors += data - last; catc->netdev->stats.rx_errors += data - last; break; } catc->stats_vals[index >> 1] = data; } static void catc_stats_timer(struct timer_list *t) { struct catc *catc = from_timer(catc, t, timer); int i; for (i = 0; i < 8; i++) catc_get_reg_async(catc, EthStats + 7 - i, catc_stats_done); mod_timer(&catc->timer, jiffies + STATS_UPDATE); } /* * Receive modes. Broadcast, Multicast, Promisc. */ static void catc_multicast(const unsigned char *addr, u8 *multicast) { u32 crc; crc = ether_crc_le(6, addr); multicast[(crc >> 3) & 0x3f] |= 1 << (crc & 7); } static void catc_set_multicast_list(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); struct netdev_hw_addr *ha; u8 broadcast[ETH_ALEN]; u8 rx = RxEnable | RxPolarity | RxMultiCast; eth_broadcast_addr(broadcast); memset(catc->multicast, 0, 64); catc_multicast(broadcast, catc->multicast); catc_multicast(netdev->dev_addr, catc->multicast); if (netdev->flags & IFF_PROMISC) { memset(catc->multicast, 0xff, 64); rx |= (!catc->is_f5u011) ? RxPromisc : AltRxPromisc; } if (netdev->flags & IFF_ALLMULTI) { memset(catc->multicast, 0xff, 64); } else { netdev_for_each_mc_addr(ha, netdev) { u32 crc = ether_crc_le(6, ha->addr); if (!catc->is_f5u011) { catc->multicast[(crc >> 3) & 0x3f] |= 1 << (crc & 7); } else { catc->multicast[7-(crc >> 29)] |= 1 << ((crc >> 26) & 7); } } } if (!catc->is_f5u011) { catc_set_reg_async(catc, RxUnit, rx); catc_write_mem_async(catc, 0xfa80, catc->multicast, 64); } else { f5u011_mchash_async(catc, catc->multicast); if (catc->rxmode[0] != rx) { catc->rxmode[0] = rx; netdev_dbg(catc->netdev, "Setting RX mode to %2.2X %2.2X\n", catc->rxmode[0], catc->rxmode[1]); f5u011_rxmode_async(catc, catc->rxmode); } } } static void catc_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct catc *catc = netdev_priv(dev); strlcpy(info->driver, driver_name, sizeof(info->driver)); strlcpy(info->version, DRIVER_VERSION, sizeof(info->version)); usb_make_path(catc->usbdev, info->bus_info, sizeof(info->bus_info)); } static int catc_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct catc *catc = netdev_priv(dev); if (!catc->is_f5u011) return -EOPNOTSUPP; ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_add_link_mode(cmd, supported, 10baseT_Half); ethtool_link_ksettings_add_link_mode(cmd, supported, TP); ethtool_link_ksettings_zero_link_mode(cmd, advertising); ethtool_link_ksettings_add_link_mode(cmd, advertising, 10baseT_Half); ethtool_link_ksettings_add_link_mode(cmd, advertising, TP); cmd->base.speed = SPEED_10; cmd->base.duplex = DUPLEX_HALF; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; return 0; } static const struct ethtool_ops ops = { .get_drvinfo = catc_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = catc_get_link_ksettings, }; /* * Open, close. */ static int catc_open(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); int status; catc->irq_urb->dev = catc->usbdev; if ((status = usb_submit_urb(catc->irq_urb, GFP_KERNEL)) < 0) { dev_err(&catc->usbdev->dev, "submit(irq_urb) status %d\n", status); return -1; } netif_start_queue(netdev); if (!catc->is_f5u011) mod_timer(&catc->timer, jiffies + STATS_UPDATE); return 0; } static int catc_stop(struct net_device *netdev) { struct catc *catc = netdev_priv(netdev); netif_stop_queue(netdev); if (!catc->is_f5u011) del_timer_sync(&catc->timer); usb_kill_urb(catc->rx_urb); usb_kill_urb(catc->tx_urb); usb_kill_urb(catc->irq_urb); usb_kill_urb(catc->ctrl_urb); return 0; } static const struct net_device_ops catc_netdev_ops = { .ndo_open = catc_open, .ndo_stop = catc_stop, .ndo_start_xmit = catc_start_xmit, .ndo_tx_timeout = catc_tx_timeout, .ndo_set_rx_mode = catc_set_multicast_list, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /* * USB probe, disconnect. */ static int catc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct device *dev = &intf->dev; struct usb_device *usbdev = interface_to_usbdev(intf); struct net_device *netdev; struct catc *catc; u8 broadcast[ETH_ALEN]; int pktsz, ret; if (usb_set_interface(usbdev, intf->altsetting->desc.bInterfaceNumber, 1)) { dev_err(dev, "Can't set altsetting 1.\n"); return -EIO; } netdev = alloc_etherdev(sizeof(struct catc)); if (!netdev) return -ENOMEM; catc = netdev_priv(netdev); netdev->netdev_ops = &catc_netdev_ops; netdev->watchdog_timeo = TX_TIMEOUT; netdev->ethtool_ops = &ops; catc->usbdev = usbdev; catc->netdev = netdev; spin_lock_init(&catc->tx_lock); spin_lock_init(&catc->ctrl_lock); timer_setup(&catc->timer, catc_stats_timer, 0); catc->ctrl_urb = usb_alloc_urb(0, GFP_KERNEL); catc->tx_urb = usb_alloc_urb(0, GFP_KERNEL); catc->rx_urb = usb_alloc_urb(0, GFP_KERNEL); catc->irq_urb = usb_alloc_urb(0, GFP_KERNEL); if ((!catc->ctrl_urb) || (!catc->tx_urb) || (!catc->rx_urb) || (!catc->irq_urb)) { dev_err(&intf->dev, "No free urbs available.\n"); ret = -ENOMEM; goto fail_free; } /* The F5U011 has the same vendor/product as the netmate but a device version of 0x130 */ if (le16_to_cpu(usbdev->descriptor.idVendor) == 0x0423 && le16_to_cpu(usbdev->descriptor.idProduct) == 0xa && le16_to_cpu(catc->usbdev->descriptor.bcdDevice) == 0x0130) { dev_dbg(dev, "Testing for f5u011\n"); catc->is_f5u011 = 1; atomic_set(&catc->recq_sz, 0); pktsz = RX_PKT_SZ; } else { pktsz = RX_MAX_BURST * (PKT_SZ + 2); } usb_fill_control_urb(catc->ctrl_urb, usbdev, usb_sndctrlpipe(usbdev, 0), NULL, NULL, 0, catc_ctrl_done, catc); usb_fill_bulk_urb(catc->tx_urb, usbdev, usb_sndbulkpipe(usbdev, 1), NULL, 0, catc_tx_done, catc); usb_fill_bulk_urb(catc->rx_urb, usbdev, usb_rcvbulkpipe(usbdev, 1), catc->rx_buf, pktsz, catc_rx_done, catc); usb_fill_int_urb(catc->irq_urb, usbdev, usb_rcvintpipe(usbdev, 2), catc->irq_buf, 2, catc_irq_done, catc, 1); if (!catc->is_f5u011) { u32 *buf; int i; dev_dbg(dev, "Checking memory size\n"); buf = kmalloc(4, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto fail_free; } *buf = 0x12345678; catc_write_mem(catc, 0x7a80, buf, 4); *buf = 0x87654321; catc_write_mem(catc, 0xfa80, buf, 4); catc_read_mem(catc, 0x7a80, buf, 4); switch (*buf) { case 0x12345678: catc_set_reg(catc, TxBufCount, 8); catc_set_reg(catc, RxBufCount, 32); dev_dbg(dev, "64k Memory\n"); break; default: dev_warn(&intf->dev, "Couldn't detect memory size, assuming 32k\n"); fallthrough; case 0x87654321: catc_set_reg(catc, TxBufCount, 4); catc_set_reg(catc, RxBufCount, 16); dev_dbg(dev, "32k Memory\n"); break; } kfree(buf); dev_dbg(dev, "Getting MAC from SEEROM.\n"); catc_get_mac(catc, netdev->dev_addr); dev_dbg(dev, "Setting MAC into registers.\n"); for (i = 0; i < 6; i++) catc_set_reg(catc, StationAddr0 - i, netdev->dev_addr[i]); dev_dbg(dev, "Filling the multicast list.\n"); eth_broadcast_addr(broadcast); catc_multicast(broadcast, catc->multicast); catc_multicast(netdev->dev_addr, catc->multicast); catc_write_mem(catc, 0xfa80, catc->multicast, 64); dev_dbg(dev, "Clearing error counters.\n"); for (i = 0; i < 8; i++) catc_set_reg(catc, EthStats + i, 0); catc->last_stats = jiffies; dev_dbg(dev, "Enabling.\n"); catc_set_reg(catc, MaxBurst, RX_MAX_BURST); catc_set_reg(catc, OpModes, OpTxMerge | OpRxMerge | OpLenInclude | Op3MemWaits); catc_set_reg(catc, LEDCtrl, LEDLink); catc_set_reg(catc, RxUnit, RxEnable | RxPolarity | RxMultiCast); } else { dev_dbg(dev, "Performing reset\n"); catc_reset(catc); catc_get_mac(catc, netdev->dev_addr); dev_dbg(dev, "Setting RX Mode\n"); catc->rxmode[0] = RxEnable | RxPolarity | RxMultiCast; catc->rxmode[1] = 0; f5u011_rxmode(catc, catc->rxmode); } dev_dbg(dev, "Init done.\n"); printk(KERN_INFO "%s: %s USB Ethernet at usb-%s-%s, %pM.\n", netdev->name, (catc->is_f5u011) ? "Belkin F5U011" : "CATC EL1210A NetMate", usbdev->bus->bus_name, usbdev->devpath, netdev->dev_addr); usb_set_intfdata(intf, catc); SET_NETDEV_DEV(netdev, &intf->dev); ret = register_netdev(netdev); if (ret) goto fail_clear_intfdata; return 0; fail_clear_intfdata: usb_set_intfdata(intf, NULL); fail_free: usb_free_urb(catc->ctrl_urb); usb_free_urb(catc->tx_urb); usb_free_urb(catc->rx_urb); usb_free_urb(catc->irq_urb); free_netdev(netdev); return ret; } static void catc_disconnect(struct usb_interface *intf) { struct catc *catc = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (catc) { unregister_netdev(catc->netdev); usb_free_urb(catc->ctrl_urb); usb_free_urb(catc->tx_urb); usb_free_urb(catc->rx_urb); usb_free_urb(catc->irq_urb); free_netdev(catc->netdev); } } /* * Module functions and tables. */ static const struct usb_device_id catc_id_table[] = { { USB_DEVICE(0x0423, 0xa) }, /* CATC Netmate, Belkin F5U011 */ { USB_DEVICE(0x0423, 0xc) }, /* CATC Netmate II, Belkin F5U111 */ { USB_DEVICE(0x08d1, 0x1) }, /* smartBridges smartNIC */ { } }; MODULE_DEVICE_TABLE(usb, catc_id_table); static struct usb_driver catc_driver = { .name = driver_name, .probe = catc_probe, .disconnect = catc_disconnect, .id_table = catc_id_table, .disable_hub_initiated_lpm = 1, }; module_usb_driver(catc_driver);
34 39 6 47 10 1 1 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner */ #ifndef _NET_BATMAN_ADV_HASH_H_ #define _NET_BATMAN_ADV_HASH_H_ #include "main.h" #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> /* callback to a compare function. should compare 2 element data for their * keys * * Return: true if same and false if not same */ typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *, const void *); /* the hashfunction * * Return: an index based on the key in the data of the first argument and the * size the second */ typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); /** * struct batadv_hashtable - Wrapper of simple hlist based hashtable */ struct batadv_hashtable { /** @table: the hashtable itself with the buckets */ struct hlist_head *table; /** @list_locks: spinlock for each hash list entry */ spinlock_t *list_locks; /** @size: size of hashtable */ u32 size; /** @generation: current (generation) sequence number */ atomic_t generation; }; /* allocates and clears the hash */ struct batadv_hashtable *batadv_hash_new(u32 size); /* set class key for all locks */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct lock_class_key *key); /* free only the hashtable and the hash itself. */ void batadv_hash_destroy(struct batadv_hashtable *hash); /** * batadv_hash_add() - adds data to the hashtable * @hash: storage hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index * @data: data passed to the aforementioned callbacks as argument * @data_node: to be added element * * Return: 0 on success, 1 if the element already is in the hash * and -1 on error. */ static inline int batadv_hash_add(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, batadv_hashdata_choose_cb choose, const void *data, struct hlist_node *data_node) { u32 index; int ret = -1; struct hlist_head *head; struct hlist_node *node; spinlock_t *list_lock; /* spinlock to protect write access */ if (!hash) goto out; index = choose(data, hash->size); head = &hash->table[index]; list_lock = &hash->list_locks[index]; spin_lock_bh(list_lock); hlist_for_each(node, head) { if (!compare(node, data)) continue; ret = 1; goto unlock; } /* no duplicate found in list, add new element */ hlist_add_head_rcu(data_node, head); atomic_inc(&hash->generation); ret = 0; unlock: spin_unlock_bh(list_lock); out: return ret; } /** * batadv_hash_remove() - Removes data from hash, if found * @hash: hash table * @compare: callback to determine if 2 hash elements are identical * @choose: callback calculating the hash index * @data: data passed to the aforementioned callbacks as argument * * ata could be the structure you use with just the key filled, we just need * the key for comparing. * * Return: returns pointer do data on success, so you can remove the used * structure yourself, or NULL on error */ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_compare_cb compare, batadv_hashdata_choose_cb choose, void *data) { u32 index; struct hlist_node *node; struct hlist_head *head; void *data_save = NULL; index = choose(data, hash->size); head = &hash->table[index]; spin_lock_bh(&hash->list_locks[index]); hlist_for_each(node, head) { if (!compare(node, data)) continue; data_save = node; hlist_del_rcu(node); atomic_inc(&hash->generation); break; } spin_unlock_bh(&hash->list_locks[index]); return data_save; } #endif /* _NET_BATMAN_ADV_HASH_H_ */
141 114 31 138 4 137 6 128 15 208 106 82 11 4 6 43 19 23 5 6 6 4 7 4 5 4 51 11 19 12 1 27 75 14 90 44 43 5 6 2 7 5 26 256 190 73 187 91 54 9 3 5 71 10 46 11 8 2 18 2 17 7 4 2 4 2 14 5 15 6 1 6 34 11 21 17 9 18 17 7 16 85 160 89 70 12 10 2 11 1 4 5 2 1 7 1 87 71 10 75 55 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The options processing module for ip.c * * Authors: A.N.Kuznetsov * */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/cipso_ipv4.h> #include <net/ip_fib.h> /* * Write options to IP header, record destination address to * source route option, address of outgoing interface * (we should already know it, so that this function is allowed be * called only after routing decision) and timestamp, * if we originate this datagram. * * daddr is real destination address, next hop is recorded in IP header. * saddr is address of outgoing interface. */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag) { unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (!is_frag) { if (opt->rr_needaddr) ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } return; } if (opt->rr) { memset(iph + opt->rr, IPOPT_NOP, iph[opt->rr + 1]); opt->rr = 0; opt->rr_needaddr = 0; } if (opt->ts) { memset(iph + opt->ts, IPOPT_NOP, iph[opt->ts + 1]); opt->ts = 0; opt->ts_needaddr = opt->ts_needtime = 0; } } /* * Provided (sopt, skb) points to received options, * build in dopt compiled option set appropriate for answering. * i.e. invert SRR option, copy anothers, * and grab room in RR/TS options. * * NOTE: dopt cannot point to skb. */ int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); if (sopt->optlen == 0) return 0; sptr = skb_network_header(skb); dptr = dopt->__data; if (sopt->rr) { optlen = sptr[sopt->rr+1]; soffset = sptr[sopt->rr+2]; dopt->rr = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->rr, optlen); if (sopt->rr_needaddr && soffset <= optlen) { if (soffset + 3 > optlen) return -EINVAL; dptr[2] = soffset + 4; dopt->rr_needaddr = 1; } dptr += optlen; dopt->optlen += optlen; } if (sopt->ts) { optlen = sptr[sopt->ts+1]; soffset = sptr[sopt->ts+2]; dopt->ts = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->ts, optlen); if (soffset <= optlen) { if (sopt->ts_needaddr) { if (soffset + 3 > optlen) return -EINVAL; dopt->ts_needaddr = 1; soffset += 4; } if (sopt->ts_needtime) { if (soffset + 3 > optlen) return -EINVAL; if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { dopt->ts_needtime = 1; soffset += 4; } else { dopt->ts_needtime = 0; if (soffset + 7 <= optlen) { __be32 addr; memcpy(&addr, dptr+soffset-1, 4); if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } } } } dptr[2] = soffset; } dptr += optlen; dopt->optlen += optlen; } if (sopt->srr) { unsigned char *start = sptr+sopt->srr; __be32 faddr; optlen = start[1]; soffset = start[2]; doffset = 0; if (soffset > optlen) soffset = optlen + 1; soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. */ if (memcmp(&ip_hdr(skb)->saddr, &start[soffset + 3], 4) == 0) doffset -= 4; } if (doffset > 3) { dopt->faddr = faddr; dptr[0] = start[0]; dptr[1] = doffset+3; dptr[2] = 4; dptr += doffset+3; dopt->srr = dopt->optlen + sizeof(struct iphdr); dopt->optlen += doffset+3; dopt->is_strictroute = sopt->is_strictroute; } } if (sopt->cipso) { optlen = sptr[sopt->cipso+1]; dopt->cipso = dopt->optlen+sizeof(struct iphdr); memcpy(dptr, sptr+sopt->cipso, optlen); dptr += optlen; dopt->optlen += optlen; } while (dopt->optlen & 3) { *dptr++ = IPOPT_END; dopt->optlen++; } return 0; } /* * Options "fragmenting", just fill options not * allowed in fragments with NOOPs. * Simple and stupid 8), but the most efficient way. */ void ip_options_fragment(struct sk_buff *skb) { unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); struct ip_options *opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); l -= optlen; optptr += optlen; } opt->ts = 0; opt->rr = 0; opt->rr_needaddr = 0; opt->ts_needaddr = 0; opt->ts_needtime = 0; } /* helper used by ip_options_compile() to call fib_compute_spec_dst() * at most one time. */ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) { if (*spec_dst == htonl(INADDR_ANY)) *spec_dst = fib_compute_spec_dst(skb); } /* * Verify options and fill pointers in struct options. * Caller should clear *opt, and set opt->data. * If opt == NULL, then skb->data should point to IP header. */ int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; struct rtable *rt = NULL; unsigned char *optptr; unsigned char *iph; int optlen, l; if (skb) { rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else optptr = opt->__data; iph = optptr - sizeof(struct iphdr); for (l = opt->optlen; l > 0; ) { switch (*optptr) { case IPOPT_END: for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; case IPOPT_NOOP: l--; optptr++; continue; } if (unlikely(l < 2)) { pp_ptr = optptr; goto error; } optlen = optptr[1]; if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { case IPOPT_SSRR: case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } /* NB: cf RFC-1812 5.2.4.1 */ if (opt->srr) { pp_ptr = optptr; goto error; } if (!skb) { if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { pp_ptr = optptr + 1; goto error; } memcpy(&opt->faddr, &optptr[3], 4); if (optlen > 7) memmove(&optptr[3], &optptr[7], optlen-7); } opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; } if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; opt->rr_needaddr = 1; } opt->rr = optptr - iph; break; case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; } if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 5) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; case IPOPT_TS_TSANDADDR: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; optptr[2] += 8; break; case IPOPT_TS_PRESPEC: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) timeptr = &optptr[optptr[2]+3]; } opt->ts_needtime = 1; optptr[2] += 8; break; default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; } break; } if (timeptr) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; } if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; } opt->cipso = optptr - iph; if (cipso_v4_validate(skb, &optptr)) { pp_ptr = optptr; goto error; } break; case IPOPT_SEC: case IPOPT_SID: default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; } break; } l -= optlen; optptr += optlen; } eol: if (!pp_ptr) return 0; error: if (info) *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) { int ret; __be32 info; ret = __ip_options_compile(net, opt, skb, &info); if (ret != 0 && skb) icmp_send(skb, ICMP_PARAMETERPROB, 0, info); return ret; } EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). */ void ip_options_undo(struct ip_options *opt) { if (opt->srr) { unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); memmove(optptr + 7, optptr + 3, optptr[1] - 7); memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); if (opt->ts_needtime) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } } } int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen) { struct ip_options_rcu *opt; opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); if (!opt) return -ENOMEM; if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) { kfree(opt); return -EFAULT; } while (optlen & 3) opt->opt.__data[optlen++] = IPOPT_END; opt->opt.optlen = optlen; if (optlen && ip_options_compile(net, &opt->opt, NULL)) { kfree(opt); return -EINVAL; } kfree(*optp); *optp = opt; return 0; } void ip_forward_options(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); unsigned char *optptr; struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); opt->is_changed = 1; } if (opt->srr_is_hit) { int srrptr, srrspace; optptr = raw + opt->srr; for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { if (srrptr + 3 > srrspace) break; if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_hdr(skb)->daddr = opt->nexthop; ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else { net_crit_ratelimited("%s(): Argh! Destination lost!\n", __func__); } if (opt->ts_needaddr) { optptr = raw + opt->ts; ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); opt->is_changed = 1; } } if (opt->is_changed) { opt->is_changed = 0; ip_send_check(ip_hdr(skb)); } } int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; unsigned long orefdst; int err; if (!rt) return 0; if (skb->pkt_type != PACKET_HOST) return -EINVAL; if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); return -EINVAL; } if (rt->rt_type != RTN_LOCAL) return -EINVAL; for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; } memcpy(&nexthop, &optptr[srrptr-1], 4); orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); skb->_skb_refdst = orefdst; return -EINVAL; } refdst_drop(orefdst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ iph->daddr = nexthop; opt->is_changed = 1; } if (srrptr <= srrspace) { opt->srr_is_hit = 1; opt->nexthop = nexthop; opt->is_changed = 1; } return 0; } EXPORT_SYMBOL(ip_options_rcv_srr);
6 23 3 6 36 14 48 10 56 7 52 42 41 50 3 5 45 45 95 95 95 83 21 47 16 25 6 36 58 49 37 49 49 3 49 2 20 7 10 4 3 3 1 1 1 2 3 22 470 167 374 423 59 77 78 60 22 129 490 4 18 13 16 8 146 10 5 107 376 274 115 114 115 111 275 275 275 352 10 6 368 283 85 365 14 365 14 380 379 380 10 367 379 17 1 49 33 17 178 15 2 7 40 6 108 3 183 266 5 474 475 81 10 2 7 2 20 20 189 140 20 70 21 51 70 29 41 19 55 52 1 51 148 17 15 2 15 2 10 7 17 8 9 2 1 4 2 17 14 3 11 6 13 4 11 6 4 4 8 5 4 7 2 7 2 9 13 4 10 4 3 14 3 12 5 13 4 266 2 262 192 183 13 40 14 10 12 7 3 29 18 14 1 15 1 20 1 16 1 28 1 15 1 19 58 35 1 1 9 14 19 12 34 21 51 50 48 94 94 76 61 71 60 79 76 87 87 83 6 76 18 1 1 6 236 14 18 232 185 65 233 17 172 55 117 118 58 6 4 3 1 2 2 2 2 2 11 7 4 1 221 3 2 6 6 7 5 7 6 7 9 8 6 2 25 17 8 1 1 266 266 1 249 34 35 180 52 128 122 1 12 41 4 2 1 56 120 176 1 1 171 3 1 46 126 1 5 152 15 69 99 114 75 38 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/fat/inode.c * * Written 1992,1993 by Werner Almesberger * VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner * Rewritten for the constant inumbers support by Al Viro * * Fixes: * * Max Cohan: Fixed invalid FSINFO offset when info_sector is 0 */ #include <linux/module.h> #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/vfs.h> #include <linux/seq_file.h> #include <linux/parser.h> #include <linux/uio.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <asm/unaligned.h> #include <linux/random.h> #include <linux/iversion.h> #include "fat.h" #ifndef CONFIG_FAT_DEFAULT_IOCHARSET /* if user don't select VFAT, this is undefined. */ #define CONFIG_FAT_DEFAULT_IOCHARSET "" #endif #define KB_IN_SECTORS 2 /* DOS dates from 1980/1/1 through 2107/12/31 */ #define FAT_DATE_MIN (0<<9 | 1<<5 | 1) #define FAT_DATE_MAX (127<<9 | 12<<5 | 31) #define FAT_TIME_MAX (23<<11 | 59<<5 | 29) /* * A deserialized copy of the on-disk structure laid out in struct * fat_boot_sector. */ struct fat_bios_param_block { u16 fat_sector_size; u8 fat_sec_per_clus; u16 fat_reserved; u8 fat_fats; u16 fat_dir_entries; u16 fat_sectors; u16 fat_fat_length; u32 fat_total_sect; u8 fat16_state; u32 fat16_vol_id; u32 fat32_length; u32 fat32_root_cluster; u16 fat32_info_sector; u8 fat32_state; u32 fat32_vol_id; }; static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE; static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET; static struct fat_floppy_defaults { unsigned nr_sectors; unsigned sec_per_clus; unsigned dir_entries; unsigned media; unsigned fat_length; } floppy_defaults[] = { { .nr_sectors = 160 * KB_IN_SECTORS, .sec_per_clus = 1, .dir_entries = 64, .media = 0xFE, .fat_length = 1, }, { .nr_sectors = 180 * KB_IN_SECTORS, .sec_per_clus = 1, .dir_entries = 64, .media = 0xFC, .fat_length = 2, }, { .nr_sectors = 320 * KB_IN_SECTORS, .sec_per_clus = 2, .dir_entries = 112, .media = 0xFF, .fat_length = 1, }, { .nr_sectors = 360 * KB_IN_SECTORS, .sec_per_clus = 2, .dir_entries = 112, .media = 0xFD, .fat_length = 2, }, }; int fat_add_cluster(struct inode *inode) { int err, cluster; err = fat_alloc_clusters(inode, &cluster, 1); if (err) return err; /* FIXME: this cluster should be added after data of this * cluster is writed */ err = fat_chain_add(inode, cluster, 1); if (err) fat_free_clusters(inode, cluster); return err; } static inline int __fat_get_block(struct inode *inode, sector_t iblock, unsigned long *max_blocks, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); unsigned long mapped_blocks; sector_t phys, last_block; int err, offset; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (phys) { map_bh(bh_result, sb, phys); *max_blocks = min(mapped_blocks, *max_blocks); return 0; } if (!create) return 0; if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)", MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); return -EIO; } last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9); offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); /* * allocate a cluster according to the following. * 1) no more available blocks * 2) not part of fallocate region */ if (!offset && !(iblock < last_block)) { /* TODO: multiple cluster allocation would be desirable. */ err = fat_add_cluster(inode); if (err) return err; } /* available blocks on this cluster */ mapped_blocks = sbi->sec_per_clus - offset; *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (!phys) { fat_fs_error(sb, "invalid FAT chain (i_pos %lld, last_block %llu)", MSDOS_I(inode)->i_pos, (unsigned long long)last_block); return -EIO; } BUG_ON(*max_blocks != mapped_blocks); set_buffer_new(bh_result); map_bh(bh_result, sb, phys); return 0; } static int fat_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; int err; err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create); if (err) return err; bh_result->b_size = max_blocks << sb->s_blocksize_bits; return 0; } static int fat_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, fat_get_block, wbc); } static int fat_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, fat_get_block); } static int fat_readpage(struct file *file, struct page *page) { return mpage_readpage(page, fat_get_block); } static void fat_readahead(struct readahead_control *rac) { mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); fat_truncate_blocks(inode, inode->i_size); } } static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int err; *pagep = NULL; err = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) fat_write_failed(mapping, pos + len); return err; } static int fat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *pagep, void *fsdata) { struct inode *inode = mapping->host; int err; err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { fat_truncate_time(inode, NULL, S_CTIME|S_MTIME); MSDOS_I(inode)->i_attrs |= ATTR_ARCH; mark_inode_dirty(inode); } return err; } static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; ssize_t ret; if (iov_iter_rw(iter) == WRITE) { /* * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), * so we need to update the ->mmu_private to block boundary. * * But we must fill the remaining area or hole by nul for * updating ->mmu_private. * * Return 0, and fallback to normal buffered write. */ loff_t size = offset + count; if (MSDOS_I(inode)->mmu_private < size) return 0; } /* * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) fat_write_failed(mapping, offset + count); return ret; } static int fat_get_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; int err; sector_t bmap; unsigned long mapped_blocks; BUG_ON(create != 0); err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); if (err) return err; if (bmap) { map_bh(bh_result, sb, bmap); max_blocks = min(mapped_blocks, max_blocks); } bh_result->b_size = max_blocks << sb->s_blocksize_bits; return 0; } static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; } /* * fat_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This is required during truncate to physically zeroout the tail end * of that block so it doesn't yield old data if the file is later grown. * Also, avoid causing failure from fsx for cases of "data past EOF" */ int fat_block_truncate_page(struct inode *inode, loff_t from) { return block_truncate_page(inode->i_mapping, from, fat_get_block); } static const struct address_space_operations fat_aops = { .set_page_dirty = __set_page_dirty_buffers, .readpage = fat_readpage, .readahead = fat_readahead, .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, .write_end = fat_write_end, .direct_IO = fat_direct_IO, .bmap = _fat_bmap }; /* * New FAT inode stuff. We do the following: * a) i_ino is constant and has nothing with on-disk location. * b) FAT manages its own cache of directory entries. * c) *This* cache is indexed by on-disk location. * d) inode has an associated directory entry, all right, but * it may be unhashed. * e) currently entries are stored within struct inode. That should * change. * f) we deal with races in the following way: * 1. readdir() and lookup() do FAT-dir-cache lookup. * 2. rename() unhashes the F-d-c entry and rehashes it in * a new place. * 3. unlink() and rmdir() unhash F-d-c entry. * 4. fat_write_inode() checks whether the thing is unhashed. * If it is we silently return. If it isn't we do bread(), * check if the location is still valid and retry if it * isn't. Otherwise we do changes. * 5. Spinlock is used to protect hash/unhash/location check/lookup * 6. fat_evict_inode() unhashes the F-d-c entry. * 7. lookup() and readdir() do igrab() if they find a F-d-c entry * and consider negative result as cache miss. */ static void fat_hash_init(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int i; spin_lock_init(&sbi->inode_hash_lock); for (i = 0; i < FAT_HASH_SIZE; i++) INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); } static inline unsigned long fat_hash(loff_t i_pos) { return hash_32(i_pos, FAT_HASH_BITS); } static void dir_hash_init(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int i; spin_lock_init(&sbi->dir_hash_lock); for (i = 0; i < FAT_HASH_SIZE; i++) INIT_HLIST_HEAD(&sbi->dir_hashtable[i]); } void fat_attach(struct inode *inode, loff_t i_pos) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); if (inode->i_ino != MSDOS_ROOT_INO) { struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); spin_lock(&sbi->inode_hash_lock); MSDOS_I(inode)->i_pos = i_pos; hlist_add_head(&MSDOS_I(inode)->i_fat_hash, head); spin_unlock(&sbi->inode_hash_lock); } /* If NFS support is enabled, cache the mapping of start cluster * to directory inode. This is used during reconnection of * dentries to the filesystem root. */ if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { struct hlist_head *d_head = sbi->dir_hashtable; d_head += fat_dir_hash(MSDOS_I(inode)->i_logstart); spin_lock(&sbi->dir_hash_lock); hlist_add_head(&MSDOS_I(inode)->i_dir_hash, d_head); spin_unlock(&sbi->dir_hash_lock); } } EXPORT_SYMBOL_GPL(fat_attach); void fat_detach(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); spin_lock(&sbi->inode_hash_lock); MSDOS_I(inode)->i_pos = 0; hlist_del_init(&MSDOS_I(inode)->i_fat_hash); spin_unlock(&sbi->inode_hash_lock); if (S_ISDIR(inode->i_mode) && sbi->options.nfs) { spin_lock(&sbi->dir_hash_lock); hlist_del_init(&MSDOS_I(inode)->i_dir_hash); spin_unlock(&sbi->dir_hash_lock); } } EXPORT_SYMBOL_GPL(fat_detach); struct inode *fat_iget(struct super_block *sb, loff_t i_pos) { struct msdos_sb_info *sbi = MSDOS_SB(sb); struct hlist_head *head = sbi->inode_hashtable + fat_hash(i_pos); struct msdos_inode_info *i; struct inode *inode = NULL; spin_lock(&sbi->inode_hash_lock); hlist_for_each_entry(i, head, i_fat_hash) { BUG_ON(i->vfs_inode.i_sb != sb); if (i->i_pos != i_pos) continue; inode = igrab(&i->vfs_inode); if (inode) break; } spin_unlock(&sbi->inode_hash_lock); return inode; } static int is_exec(unsigned char *extension) { unsigned char exe_extensions[] = "EXECOMBAT", *walk; for (walk = exe_extensions; *walk; walk += 3) if (!strncmp(extension, walk, 3)) return 1; return 0; } static int fat_calc_dir_size(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int ret, fclus, dclus; inode->i_size = 0; if (MSDOS_I(inode)->i_start == 0) return 0; ret = fat_get_cluster(inode, FAT_ENT_EOF, &fclus, &dclus); if (ret < 0) return ret; inode->i_size = (fclus + 1) << sbi->cluster_bits; return 0; } static int fat_validate_dir(struct inode *dir) { struct super_block *sb = dir->i_sb; if (dir->i_nlink < 2) { /* Directory should have "."/".." entries at least. */ fat_fs_error(sb, "corrupted directory (invalid entries)"); return -EIO; } if (MSDOS_I(dir)->i_start == 0 || MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) { /* Directory should point valid cluster. */ fat_fs_error(sb, "corrupted directory (invalid i_start)"); return -EIO; } return 0; } /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; MSDOS_I(inode)->i_pos = 0; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); inode->i_generation = prandom_u32(); if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { inode->i_generation &= ~1; inode->i_mode = fat_make_mode(sbi, de->attr, S_IRWXUGO); inode->i_op = sbi->dir_ops; inode->i_fop = &fat_dir_operations; MSDOS_I(inode)->i_start = fat_get_start(sbi, de); MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; error = fat_calc_dir_size(inode); if (error < 0) return error; MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); error = fat_validate_dir(inode); if (error < 0) return error; } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, ((sbi->options.showexec && !is_exec(de->name + 8)) ? S_IRUGO|S_IWUGO : S_IRWXUGO)); MSDOS_I(inode)->i_start = fat_get_start(sbi, de); MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; inode->i_size = le32_to_cpu(de->size); inode->i_op = &fat_file_inode_operations; inode->i_fop = &fat_file_operations; inode->i_mapping->a_ops = &fat_aops; MSDOS_I(inode)->mmu_private = inode->i_size; } if (de->attr & ATTR_SYS) { if (sbi->options.sys_immutable) inode->i_flags |= S_IMMUTABLE; } fat_save_attrs(inode, de->attr); inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); if (sbi->options.isvfat) { fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, de->cdate, de->ctime_cs); fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); } else fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME); return 0; } static inline void fat_lock_build_inode(struct msdos_sb_info *sbi) { if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) mutex_lock(&sbi->nfs_build_inode_lock); } static inline void fat_unlock_build_inode(struct msdos_sb_info *sbi) { if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) mutex_unlock(&sbi->nfs_build_inode_lock); } struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos) { struct inode *inode; int err; fat_lock_build_inode(MSDOS_SB(sb)); inode = fat_iget(sb, i_pos); if (inode) goto out; inode = new_inode(sb); if (!inode) { inode = ERR_PTR(-ENOMEM); goto out; } inode->i_ino = iunique(sb, MSDOS_ROOT_INO); inode_set_iversion(inode, 1); err = fat_fill_inode(inode, de); if (err) { iput(inode); inode = ERR_PTR(err); goto out; } fat_attach(inode, i_pos); insert_inode_hash(inode); out: fat_unlock_build_inode(MSDOS_SB(sb)); return inode; } EXPORT_SYMBOL_GPL(fat_build_inode); static int __fat_write_inode(struct inode *inode, int wait); static void fat_free_eofblocks(struct inode *inode) { /* Release unwritten fallocated blocks on inode eviction. */ if ((inode->i_blocks << 9) > round_up(MSDOS_I(inode)->mmu_private, MSDOS_SB(inode->i_sb)->cluster_size)) { int err; fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); /* Fallocate results in updating the i_start/iogstart * for the zero byte file. So, make it return to * original state during evict and commit it to avoid * any corruption on the next access to the cluster * chain for the file. */ err = __fat_write_inode(inode, inode_needs_sync(inode)); if (err) { fat_msg(inode->i_sb, KERN_WARNING, "Failed to " "update on disk inode for unused " "fallocated blocks, inode could be " "corrupted. Please run fsck"); } } } static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); } else fat_free_eofblocks(inode); invalidate_inode_buffers(inode); clear_inode(inode); fat_cache_inval_inode(inode); fat_detach(inode); } static void fat_set_state(struct super_block *sb, unsigned int set, unsigned int force) { struct buffer_head *bh; struct fat_boot_sector *b; struct msdos_sb_info *sbi = MSDOS_SB(sb); /* do not change any thing if mounted read only */ if (sb_rdonly(sb) && !force) return; /* do not change state if fs was dirty */ if (sbi->dirty) { /* warn only on set (mount). */ if (set) fat_msg(sb, KERN_WARNING, "Volume was not properly " "unmounted. Some data may be corrupt. " "Please run fsck."); return; } bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector " "to mark fs as dirty"); return; } b = (struct fat_boot_sector *) bh->b_data; if (is_fat32(sbi)) { if (set) b->fat32.state |= FAT_STATE_DIRTY; else b->fat32.state &= ~FAT_STATE_DIRTY; } else /* fat 16 and 12 */ { if (set) b->fat16.state |= FAT_STATE_DIRTY; else b->fat16.state &= ~FAT_STATE_DIRTY; } mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); } static void fat_reset_iocharset(struct fat_mount_options *opts) { if (opts->iocharset != fat_default_iocharset) { /* Note: opts->iocharset can be NULL here */ kfree(opts->iocharset); opts->iocharset = fat_default_iocharset; } } static void delayed_free(struct rcu_head *p) { struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu); unload_nls(sbi->nls_disk); unload_nls(sbi->nls_io); fat_reset_iocharset(&sbi->options); kfree(sbi); } static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); fat_set_state(sb, 0, 0); iput(sbi->fsinfo_inode); iput(sbi->fat_inode); call_rcu(&sbi->rcu, delayed_free); } static struct kmem_cache *fat_inode_cachep; static struct inode *fat_alloc_inode(struct super_block *sb) { struct msdos_inode_info *ei; ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS); if (!ei) return NULL; init_rwsem(&ei->truncate_lock); /* Zeroing to allow iput() even if partial initialized inode. */ ei->mmu_private = 0; ei->i_start = 0; ei->i_logstart = 0; ei->i_attrs = 0; ei->i_pos = 0; return &ei->vfs_inode; } static void fat_free_inode(struct inode *inode) { kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } static void init_once(void *foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; spin_lock_init(&ei->cache_lru_lock); ei->nr_caches = 0; ei->cache_valid_id = FAT_CACHE_VALID + 1; INIT_LIST_HEAD(&ei->cache_lru); INIT_HLIST_NODE(&ei->i_fat_hash); INIT_HLIST_NODE(&ei->i_dir_hash); inode_init_once(&ei->vfs_inode); } static int __init fat_init_inodecache(void) { fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; return 0; } static void __exit fat_destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(fat_inode_cachep); } static int fat_remount(struct super_block *sb, int *flags, char *data) { bool new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); sync_filesystem(sb); /* make sure we update state on remount. */ new_rdonly = *flags & SB_RDONLY; if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); else fat_set_state(sb, 1, 1); } return 0; } static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); /* If the count of free cluster is still unknown, counts it here. */ if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { int err = fat_count_free_clusters(dentry->d_sb); if (err) return err; } buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; buf->f_bavail = sbi->free_clusters; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; return 0; } static int __fat_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *raw_entry; loff_t i_pos; sector_t blocknr; int err, offset; if (inode->i_ino == MSDOS_ROOT_INO) return 0; retry: i_pos = fat_i_pos_read(sbi, inode); if (!i_pos) return 0; fat_get_blknr_offset(sbi, i_pos, &blocknr, &offset); bh = sb_bread(sb, blocknr); if (!bh) { fat_msg(sb, KERN_ERR, "unable to read inode block " "for updating (i_pos %lld)", i_pos); return -EIO; } spin_lock(&sbi->inode_hash_lock); if (i_pos != MSDOS_I(inode)->i_pos) { spin_unlock(&sbi->inode_hash_lock); brelse(bh); goto retry; } raw_entry = &((struct msdos_dir_entry *) (bh->b_data))[offset]; if (S_ISDIR(inode->i_mode)) raw_entry->size = 0; else raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); err = 0; if (wait) err = sync_dirty_buffer(bh); brelse(bh); return err; } static int fat_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (inode->i_ino == MSDOS_FSINFO_INO) { struct super_block *sb = inode->i_sb; mutex_lock(&MSDOS_SB(sb)->s_lock); err = fat_clusters_flush(sb); mutex_unlock(&MSDOS_SB(sb)->s_lock); } else err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); return err; } int fat_sync_inode(struct inode *inode) { return __fat_write_inode(inode, 1); } EXPORT_SYMBOL_GPL(fat_sync_inode); static int fat_show_options(struct seq_file *m, struct dentry *root); static const struct super_operations fat_sops = { .alloc_inode = fat_alloc_inode, .free_inode = fat_free_inode, .write_inode = fat_write_inode, .evict_inode = fat_evict_inode, .put_super = fat_put_super, .statfs = fat_statfs, .remount_fs = fat_remount, .show_options = fat_show_options, }; static int fat_show_options(struct seq_file *m, struct dentry *root) { struct msdos_sb_info *sbi = MSDOS_SB(root->d_sb); struct fat_mount_options *opts = &sbi->options; int isvfat = opts->isvfat; if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, opts->fs_uid)); if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, opts->fs_gid)); seq_printf(m, ",fmask=%04o", opts->fs_fmask); seq_printf(m, ",dmask=%04o", opts->fs_dmask); if (opts->allow_utime) seq_printf(m, ",allow_utime=%04o", opts->allow_utime); if (sbi->nls_disk) /* strip "cp" prefix from displayed option */ seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]); if (isvfat) { if (sbi->nls_io) seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); switch (opts->shortname) { case VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95: seq_puts(m, ",shortname=win95"); break; case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT: seq_puts(m, ",shortname=winnt"); break; case VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95: seq_puts(m, ",shortname=mixed"); break; case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: seq_puts(m, ",shortname=lower"); break; default: seq_puts(m, ",shortname=unknown"); break; } } if (opts->name_check != 'n') seq_printf(m, ",check=%c", opts->name_check); if (opts->usefree) seq_puts(m, ",usefree"); if (opts->quiet) seq_puts(m, ",quiet"); if (opts->showexec) seq_puts(m, ",showexec"); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (!isvfat) { if (opts->dotsOK) seq_puts(m, ",dotsOK=yes"); if (opts->nocase) seq_puts(m, ",nocase"); } else { if (opts->utf8) seq_puts(m, ",utf8"); if (opts->unicode_xlate) seq_puts(m, ",uni_xlate"); if (!opts->numtail) seq_puts(m, ",nonumtail"); if (opts->rodir) seq_puts(m, ",rodir"); } if (opts->flush) seq_puts(m, ",flush"); if (opts->tz_set) { if (opts->time_offset) seq_printf(m, ",time_offset=%d", opts->time_offset); else seq_puts(m, ",tz=UTC"); } if (opts->errors == FAT_ERRORS_CONT) seq_puts(m, ",errors=continue"); else if (opts->errors == FAT_ERRORS_PANIC) seq_puts(m, ",errors=panic"); else seq_puts(m, ",errors=remount-ro"); if (opts->nfs == FAT_NFS_NOSTALE_RO) seq_puts(m, ",nfs=nostale_ro"); else if (opts->nfs) seq_puts(m, ",nfs=stale_rw"); if (opts->discard) seq_puts(m, ",discard"); if (opts->dos1xfloppy) seq_puts(m, ",dos1xfloppy"); return 0; } enum { Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage, Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable, Opt_dots, Opt_nodots, Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset, Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy, }; static const match_table_t fat_tokens = { {Opt_check_r, "check=relaxed"}, {Opt_check_s, "check=strict"}, {Opt_check_n, "check=normal"}, {Opt_check_r, "check=r"}, {Opt_check_s, "check=s"}, {Opt_check_n, "check=n"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_umask, "umask=%o"}, {Opt_dmask, "dmask=%o"}, {Opt_fmask, "fmask=%o"}, {Opt_allow_utime, "allow_utime=%o"}, {Opt_codepage, "codepage=%u"}, {Opt_usefree, "usefree"}, {Opt_nocase, "nocase"}, {Opt_quiet, "quiet"}, {Opt_showexec, "showexec"}, {Opt_debug, "debug"}, {Opt_immutable, "sys_immutable"}, {Opt_flush, "flush"}, {Opt_tz_utc, "tz=UTC"}, {Opt_time_offset, "time_offset=%d"}, {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_discard, "discard"}, {Opt_nfs_stale_rw, "nfs"}, {Opt_nfs_stale_rw, "nfs=stale_rw"}, {Opt_nfs_nostale_ro, "nfs=nostale_ro"}, {Opt_dos1xfloppy, "dos1xfloppy"}, {Opt_obsolete, "conv=binary"}, {Opt_obsolete, "conv=text"}, {Opt_obsolete, "conv=auto"}, {Opt_obsolete, "conv=b"}, {Opt_obsolete, "conv=t"}, {Opt_obsolete, "conv=a"}, {Opt_obsolete, "fat=%u"}, {Opt_obsolete, "blocksize=%u"}, {Opt_obsolete, "cvf_format=%20s"}, {Opt_obsolete, "cvf_options=%100s"}, {Opt_obsolete, "posix"}, {Opt_err, NULL}, }; static const match_table_t msdos_tokens = { {Opt_nodots, "nodots"}, {Opt_nodots, "dotsOK=no"}, {Opt_dots, "dots"}, {Opt_dots, "dotsOK=yes"}, {Opt_err, NULL} }; static const match_table_t vfat_tokens = { {Opt_charset, "iocharset=%s"}, {Opt_shortname_lower, "shortname=lower"}, {Opt_shortname_win95, "shortname=win95"}, {Opt_shortname_winnt, "shortname=winnt"}, {Opt_shortname_mixed, "shortname=mixed"}, {Opt_utf8_no, "utf8=0"}, /* 0 or no or false */ {Opt_utf8_no, "utf8=no"}, {Opt_utf8_no, "utf8=false"}, {Opt_utf8_yes, "utf8=1"}, /* empty or 1 or yes or true */ {Opt_utf8_yes, "utf8=yes"}, {Opt_utf8_yes, "utf8=true"}, {Opt_utf8_yes, "utf8"}, {Opt_uni_xl_no, "uni_xlate=0"}, /* 0 or no or false */ {Opt_uni_xl_no, "uni_xlate=no"}, {Opt_uni_xl_no, "uni_xlate=false"}, {Opt_uni_xl_yes, "uni_xlate=1"}, /* empty or 1 or yes or true */ {Opt_uni_xl_yes, "uni_xlate=yes"}, {Opt_uni_xl_yes, "uni_xlate=true"}, {Opt_uni_xl_yes, "uni_xlate"}, {Opt_nonumtail_no, "nonumtail=0"}, /* 0 or no or false */ {Opt_nonumtail_no, "nonumtail=no"}, {Opt_nonumtail_no, "nonumtail=false"}, {Opt_nonumtail_yes, "nonumtail=1"}, /* empty or 1 or yes or true */ {Opt_nonumtail_yes, "nonumtail=yes"}, {Opt_nonumtail_yes, "nonumtail=true"}, {Opt_nonumtail_yes, "nonumtail"}, {Opt_rodir, "rodir"}, {Opt_err, NULL} }; static int parse_options(struct super_block *sb, char *options, int is_vfat, int silent, int *debug, struct fat_mount_options *opts) { char *p; substring_t args[MAX_OPT_ARGS]; int option; char *iocharset; opts->isvfat = is_vfat; opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); opts->fs_fmask = opts->fs_dmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; fat_reset_iocharset(opts); if (is_vfat) { opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95; opts->rodir = 0; } else { opts->shortname = 0; opts->rodir = 1; } opts->name_check = 'n'; opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; opts->tz_set = 0; opts->nfs = 0; opts->errors = FAT_ERRORS_RO; *debug = 0; opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat; if (!options) goto out; while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, fat_tokens, args); if (token == Opt_err) { if (is_vfat) token = match_token(p, vfat_tokens, args); else token = match_token(p, msdos_tokens, args); } switch (token) { case Opt_check_s: opts->name_check = 's'; break; case Opt_check_r: opts->name_check = 'r'; break; case Opt_check_n: opts->name_check = 'n'; break; case Opt_usefree: opts->usefree = 1; break; case Opt_nocase: if (!is_vfat) opts->nocase = 1; else { /* for backward compatibility */ opts->shortname = VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95; } break; case Opt_quiet: opts->quiet = 1; break; case Opt_showexec: opts->showexec = 1; break; case Opt_debug: *debug = 1; break; case Opt_immutable: opts->sys_immutable = 1; break; case Opt_uid: if (match_int(&args[0], &option)) return -EINVAL; opts->fs_uid = make_kuid(current_user_ns(), option); if (!uid_valid(opts->fs_uid)) return -EINVAL; break; case Opt_gid: if (match_int(&args[0], &option)) return -EINVAL; opts->fs_gid = make_kgid(current_user_ns(), option); if (!gid_valid(opts->fs_gid)) return -EINVAL; break; case Opt_umask: if (match_octal(&args[0], &option)) return -EINVAL; opts->fs_fmask = opts->fs_dmask = option; break; case Opt_dmask: if (match_octal(&args[0], &option)) return -EINVAL; opts->fs_dmask = option; break; case Opt_fmask: if (match_octal(&args[0], &option)) return -EINVAL; opts->fs_fmask = option; break; case Opt_allow_utime: if (match_octal(&args[0], &option)) return -EINVAL; opts->allow_utime = option & (S_IWGRP | S_IWOTH); break; case Opt_codepage: if (match_int(&args[0], &option)) return -EINVAL; opts->codepage = option; break; case Opt_flush: opts->flush = 1; break; case Opt_time_offset: if (match_int(&args[0], &option)) return -EINVAL; /* * GMT+-12 zones may have DST corrections so at least * 13 hours difference is needed. Make the limit 24 * just in case someone invents something unusual. */ if (option < -24 * 60 || option > 24 * 60) return -EINVAL; opts->tz_set = 1; opts->time_offset = option; break; case Opt_tz_utc: opts->tz_set = 1; opts->time_offset = 0; break; case Opt_err_cont: opts->errors = FAT_ERRORS_CONT; break; case Opt_err_panic: opts->errors = FAT_ERRORS_PANIC; break; case Opt_err_ro: opts->errors = FAT_ERRORS_RO; break; case Opt_nfs_stale_rw: opts->nfs = FAT_NFS_STALE_RW; break; case Opt_nfs_nostale_ro: opts->nfs = FAT_NFS_NOSTALE_RO; break; case Opt_dos1xfloppy: opts->dos1xfloppy = 1; break; /* msdos specific */ case Opt_dots: opts->dotsOK = 1; break; case Opt_nodots: opts->dotsOK = 0; break; /* vfat specific */ case Opt_charset: fat_reset_iocharset(opts); iocharset = match_strdup(&args[0]); if (!iocharset) return -ENOMEM; opts->iocharset = iocharset; break; case Opt_shortname_lower: opts->shortname = VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95; break; case Opt_shortname_win95: opts->shortname = VFAT_SFN_DISPLAY_WIN95 | VFAT_SFN_CREATE_WIN95; break; case Opt_shortname_winnt: opts->shortname = VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WINNT; break; case Opt_shortname_mixed: opts->shortname = VFAT_SFN_DISPLAY_WINNT | VFAT_SFN_CREATE_WIN95; break; case Opt_utf8_no: /* 0 or no or false */ opts->utf8 = 0; break; case Opt_utf8_yes: /* empty or 1 or yes or true */ opts->utf8 = 1; break; case Opt_uni_xl_no: /* 0 or no or false */ opts->unicode_xlate = 0; break; case Opt_uni_xl_yes: /* empty or 1 or yes or true */ opts->unicode_xlate = 1; break; case Opt_nonumtail_no: /* 0 or no or false */ opts->numtail = 1; /* negated option */ break; case Opt_nonumtail_yes: /* empty or 1 or yes or true */ opts->numtail = 0; /* negated option */ break; case Opt_rodir: opts->rodir = 1; break; case Opt_discard: opts->discard = 1; break; /* obsolete mount options */ case Opt_obsolete: fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " "not supported now", p); break; /* unknown option */ default: if (!silent) { fat_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " "or missing value", p); } return -EINVAL; } } out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset" " for FAT filesystems, filesystem will be " "case sensitive!"); } /* If user doesn't specify allow_utime, it's initialized from dmask. */ if (opts->allow_utime == (unsigned short)-1) opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); if (opts->unicode_xlate) opts->utf8 = 0; if (opts->nfs == FAT_NFS_NOSTALE_RO) { sb->s_flags |= SB_RDONLY; sb->s_export_op = &fat_export_ops_nostale; } return 0; } static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); inode->i_generation = 0; inode->i_mode = fat_make_mode(sbi, ATTR_DIR, S_IRWXUGO); inode->i_op = sbi->dir_ops; inode->i_fop = &fat_dir_operations; if (is_fat32(sbi)) { MSDOS_I(inode)->i_start = sbi->root_cluster; error = fat_calc_dir_size(inode); if (error < 0) return error; } else { MSDOS_I(inode)->i_start = 0; inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry); } inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; MSDOS_I(inode)->i_logstart = 0; MSDOS_I(inode)->mmu_private = inode->i_size; fat_save_attrs(inode, ATTR_DIR); inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; set_nlink(inode, fat_subdirs(inode)+2); return 0; } static unsigned long calc_fat_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); /* Divide first to avoid overflow */ if (!is_fat12(sbi)) { unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; return ent_per_sec * sbi->fat_length; } return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; } static bool fat_bpb_is_zero(struct fat_boot_sector *b) { if (get_unaligned_le16(&b->sector_size)) return false; if (b->sec_per_clus) return false; if (b->reserved) return false; if (b->fats) return false; if (get_unaligned_le16(&b->dir_entries)) return false; if (get_unaligned_le16(&b->sectors)) return false; if (b->media) return false; if (b->fat_length) return false; if (b->secs_track) return false; if (b->heads) return false; return true; } static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b, int silent, struct fat_bios_param_block *bpb) { int error = -EINVAL; /* Read in BPB ... */ memset(bpb, 0, sizeof(*bpb)); bpb->fat_sector_size = get_unaligned_le16(&b->sector_size); bpb->fat_sec_per_clus = b->sec_per_clus; bpb->fat_reserved = le16_to_cpu(b->reserved); bpb->fat_fats = b->fats; bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries); bpb->fat_sectors = get_unaligned_le16(&b->sectors); bpb->fat_fat_length = le16_to_cpu(b->fat_length); bpb->fat_total_sect = le32_to_cpu(b->total_sect); bpb->fat16_state = b->fat16.state; bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id); bpb->fat32_length = le32_to_cpu(b->fat32.length); bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster); bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector); bpb->fat32_state = b->fat32.state; bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id); /* Validate this looks like a FAT filesystem BPB */ if (!bpb->fat_reserved) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); goto out; } if (!bpb->fat_fats) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); goto out; } /* * Earlier we checked here that b->secs_track and b->head are nonzero, * but it turns out valid FAT filesystems can have zero there. */ if (!fat_valid_media(b->media)) { if (!silent) fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", (unsigned)b->media); goto out; } if (!is_power_of_2(bpb->fat_sector_size) || (bpb->fat_sector_size < 512) || (bpb->fat_sector_size > 4096)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus logical sector size %u", (unsigned)bpb->fat_sector_size); goto out; } if (!is_power_of_2(bpb->fat_sec_per_clus)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", (unsigned)bpb->fat_sec_per_clus); goto out; } if (bpb->fat_fat_length == 0 && bpb->fat32_length == 0) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of FAT sectors"); goto out; } error = 0; out: return error; } static int fat_read_static_bpb(struct super_block *sb, struct fat_boot_sector *b, int silent, struct fat_bios_param_block *bpb) { static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; struct fat_floppy_defaults *fdefaults = NULL; int error = -EINVAL; sector_t bd_sects; unsigned i; bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE; /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { if (!silent) fat_msg(sb, KERN_ERR, "%s; no bootstrapping code", notdos1x); goto out; } /* * If any value in this region is non-zero, it isn't archaic * DOS. */ if (!fat_bpb_is_zero(b)) { if (!silent) fat_msg(sb, KERN_ERR, "%s; DOS 2.x BPB is non-zero", notdos1x); goto out; } for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) { if (floppy_defaults[i].nr_sectors == bd_sects) { fdefaults = &floppy_defaults[i]; break; } } if (fdefaults == NULL) { if (!silent) fat_msg(sb, KERN_WARNING, "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)", (u64)bd_sects); goto out; } if (!silent) fat_msg(sb, KERN_INFO, "This looks like a DOS 1.x volume; assuming default BPB values"); memset(bpb, 0, sizeof(*bpb)); bpb->fat_sector_size = SECTOR_SIZE; bpb->fat_sec_per_clus = fdefaults->sec_per_clus; bpb->fat_reserved = 1; bpb->fat_fats = 2; bpb->fat_dir_entries = fdefaults->dir_entries; bpb->fat_sectors = fdefaults->nr_sectors; bpb->fat_fat_length = fdefaults->fat_length; error = 0; out: return error; } /* * Read the super block of an MS-DOS FS. */ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, void (*setup)(struct super_block *)) { struct inode *root_inode = NULL, *fat_inode = NULL; struct inode *fsinfo_inode = NULL; struct buffer_head *bh; struct fat_bios_param_block bpb; struct msdos_sb_info *sbi; u16 logical_sector_size; u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors; int debug; long error; char buf[50]; struct timespec64 ts; /* * GFP_KERNEL is ok here, because while we do hold the * superblock lock, memory pressure can't call back into * the filesystem, since we're only just about to mount * it and have no inodes etc active! */ sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; sb->s_flags |= SB_NODIRATIME; sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; /* * fat timestamps are complex and truncated by fat itself, so * we set 1 here to be fast */ sb->s_time_gran = 1; mutex_init(&sbi->nfs_build_inode_lock); ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options); if (error) goto out_fail; setup(sb); /* flavour-specific stuff that needs options */ error = -EIO; sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector"); goto out_fail; } error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent, &bpb); if (error == -EINVAL && sbi->options.dos1xfloppy) error = fat_read_static_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent, &bpb); brelse(bh); if (error == -EINVAL) goto out_invalid; else if (error) goto out_fail; logical_sector_size = bpb.fat_sector_size; sbi->sec_per_clus = bpb.fat_sec_per_clus; error = -EIO; if (logical_sector_size < sb->s_blocksize) { fat_msg(sb, KERN_ERR, "logical sector size too small for device" " (logical sector size = %u)", logical_sector_size); goto out_fail; } if (logical_sector_size > sb->s_blocksize) { struct buffer_head *bh_resize; if (!sb_set_blocksize(sb, logical_sector_size)) { fat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sector_size); goto out_fail; } /* Verify that the larger boot sector is fully readable */ bh_resize = sb_bread(sb, 0); if (bh_resize == NULL) { fat_msg(sb, KERN_ERR, "unable to read boot sector" " (logical sector size = %lu)", sb->s_blocksize); goto out_fail; } brelse(bh_resize); } mutex_init(&sbi->s_lock); sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus; sbi->cluster_bits = ffs(sbi->cluster_size) - 1; sbi->fats = bpb.fat_fats; sbi->fat_bits = 0; /* Don't know yet */ sbi->fat_start = bpb.fat_reserved; sbi->fat_length = bpb.fat_fat_length; sbi->root_cluster = 0; sbi->free_clusters = -1; /* Don't know yet */ sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; sb->s_maxbytes = 0xffffffff; fat_time_fat2unix(sbi, &ts, 0, cpu_to_le16(FAT_DATE_MIN), 0); sb->s_time_min = ts.tv_sec; fat_time_fat2unix(sbi, &ts, cpu_to_le16(FAT_TIME_MAX), cpu_to_le16(FAT_DATE_MAX), 0); sb->s_time_max = ts.tv_sec; if (!sbi->fat_length && bpb.fat32_length) { struct fat_boot_fsinfo *fsinfo; struct buffer_head *fsinfo_bh; /* Must be FAT32 */ sbi->fat_bits = 32; sbi->fat_length = bpb.fat32_length; sbi->root_cluster = bpb.fat32_root_cluster; /* MC - if info_sector is 0, don't multiply by 0 */ sbi->fsinfo_sector = bpb.fat32_info_sector; if (sbi->fsinfo_sector == 0) sbi->fsinfo_sector = 1; fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); if (fsinfo_bh == NULL) { fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" " (sector = %lu)", sbi->fsinfo_sector); goto out_fail; } fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; if (!IS_FSINFO(fsinfo)) { fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: " "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); } else { if (sbi->options.usefree) sbi->free_clus_valid = 1; sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters); sbi->prev_free = le32_to_cpu(fsinfo->next_cluster); } brelse(fsinfo_bh); } /* interpret volume ID as a little endian 32 bit integer */ if (is_fat32(sbi)) sbi->vol_id = bpb.fat32_vol_id; else /* fat 16 or 12 */ sbi->vol_id = bpb.fat16_vol_id; sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; sbi->dir_entries = bpb.fat_dir_entries; if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) fat_msg(sb, KERN_ERR, "bogus number of directory entries" " (%u)", sbi->dir_entries); goto out_invalid; } rootdir_sectors = sbi->dir_entries * sizeof(struct msdos_dir_entry) / sb->s_blocksize; sbi->data_start = sbi->dir_start + rootdir_sectors; total_sectors = bpb.fat_sectors; if (total_sectors == 0) total_sectors = bpb.fat_total_sect; total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus; if (!is_fat32(sbi)) sbi->fat_bits = (total_clusters > MAX_FAT12) ? 16 : 12; /* some OSes set FAT_STATE_DIRTY and clean it on unmount. */ if (is_fat32(sbi)) sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY; else /* fat 16 or 12 */ sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ fat_clusters = calc_fat_clusters(sb); total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > max_fat(sb)) { if (!silent) fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", total_clusters); goto out_invalid; } sbi->max_cluster = total_clusters + FAT_START_ENT; /* check the free_clusters, it's not necessarily correct */ if (sbi->free_clusters != -1 && sbi->free_clusters > total_clusters) sbi->free_clusters = -1; /* check the prev_free, it's not necessarily correct */ sbi->prev_free %= sbi->max_cluster; if (sbi->prev_free < FAT_START_ENT) sbi->prev_free = FAT_START_ENT; /* set up enough so that it can read an inode */ fat_hash_init(sb); dir_hash_init(sb); fat_ent_access_init(sb); /* * The low byte of the first FAT entry must have the same value as * the media field of the boot sector. But in real world, too many * devices are writing wrong values. So, removed that validity check. * * The removed check compared the first FAT entry to a value dependent * on the media field like this: * == (0x0F00 | media), for FAT12 * == (0XFF00 | media), for FAT16 * == (0x0FFFFF | media), for FAT32 */ error = -EINVAL; sprintf(buf, "cp%d", sbi->options.codepage); sbi->nls_disk = load_nls(buf); if (!sbi->nls_disk) { fat_msg(sb, KERN_ERR, "codepage %s not found", buf); goto out_fail; } /* FIXME: utf8 is using iocharset for upper/lower conversion */ if (sbi->options.isvfat) { sbi->nls_io = load_nls(sbi->options.iocharset); if (!sbi->nls_io) { fat_msg(sb, KERN_ERR, "IO charset %s not found", sbi->options.iocharset); goto out_fail; } } error = -ENOMEM; fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode); root_inode = new_inode(sb); if (!root_inode) goto out_fail; root_inode->i_ino = MSDOS_ROOT_INO; inode_set_iversion(root_inode, 1); error = fat_read_root(root_inode); if (error < 0) { iput(root_inode); goto out_fail; } error = -ENOMEM; insert_inode_hash(root_inode); fat_attach(root_inode, 0); sb->s_root = d_make_root(root_inode); if (!sb->s_root) { fat_msg(sb, KERN_ERR, "get root inode failed"); goto out_fail; } if (sbi->options.discard) { struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) fat_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } fat_set_state(sb, 1, 0); return 0; out_invalid: error = -EINVAL; if (!silent) fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: if (fsinfo_inode) iput(fsinfo_inode); if (fat_inode) iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); fat_reset_iocharset(&sbi->options); sb->s_fs_info = NULL; kfree(sbi); return error; } EXPORT_SYMBOL_GPL(fat_fill_super); /* * helper function for fat_flush_inodes. This writes both the inode * and the file data blocks, waiting for in flight data blocks before * the start of the call. It does not wait for any io started * during the call */ static int writeback_inode(struct inode *inode) { int ret; /* if we used wait=1, sync_inode_metadata waits for the io for the * inode to finish. So wait=0 is sent down to sync_inode_metadata * and filemap_fdatawrite is used for the data blocks */ ret = sync_inode_metadata(inode, 0); if (!ret) ret = filemap_fdatawrite(inode->i_mapping); return ret; } /* * write data and metadata corresponding to i1 and i2. The io is * started but we do not wait for any of it to finish. * * filemap_flush is used for the block device, so if there is a dirty * page for a block already in flight, we will not wait and start the * io over again */ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) { int ret = 0; if (!MSDOS_SB(sb)->options.flush) return 0; if (i1) ret = writeback_inode(i1); if (!ret && i2) ret = writeback_inode(i2); if (!ret) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; ret = filemap_flush(mapping); } return ret; } EXPORT_SYMBOL_GPL(fat_flush_inodes); static int __init init_fat_fs(void) { int err; err = fat_cache_init(); if (err) return err; err = fat_init_inodecache(); if (err) goto failed; return 0; failed: fat_cache_destroy(); return err; } static void __exit exit_fat_fs(void) { fat_cache_destroy(); fat_destroy_inodecache(); } module_init(init_fat_fs) module_exit(exit_fat_fs) MODULE_LICENSE("GPL");
28 66 275 293 71 70 1 1 1 11449 5 192 12 1 125 6131 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_MM_H #define _LINUX_SCHED_MM_H #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/gfp.h> #include <linux/sync_core.h> /* * Routines for handling mm_structs */ extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. * @mm: The &struct mm_struct to pin. * * Make sure that @mm will not get freed even after the owning task * exits. This doesn't guarantee that the associated address space * will still exist later on and mmget_not_zero() has to be used before * accessing it. * * This is a preferred way to pin @mm for a longer/unbounded amount * of time. * * Use mmdrop() to release the reference acquired by mmgrab(). * * See also <Documentation/vm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) { atomic_inc(&mm->mm_count); } extern void __mmdrop(struct mm_struct *mm); static inline void mmdrop(struct mm_struct *mm) { /* * The implicit full barrier implied by atomic_dec_and_test() is * required by the membarrier system call before returning to * user-space, after storing to rq->curr. */ if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. * * Make sure that the address space of the given &struct mm_struct doesn't * go away. This does not protect against parts of the address space being * modified or freed, however. * * Never use this function to pin this address space for an * unbounded/indefinite amount of time. * * Use mmput() to release the reference acquired by mmget(). * * See also <Documentation/vm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) { atomic_inc(&mm->mm_users); } static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); } /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); #ifdef CONFIG_MMU /* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ void mmput_async(struct mm_struct *); #endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* * Grab a reference to a task's mm, if it is not already going away * and ptrace_may_access with the mode parameter passed to it * succeeds. */ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct on exit() */ extern void exit_mm_release(struct task_struct *, struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct on exec() */ extern void exec_mm_release(struct task_struct *, struct mm_struct *); #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU #ifndef arch_get_mmap_end #define arch_get_mmap_end(addr) (TASK_SIZE) #endif #ifndef arch_get_mmap_base #define arch_get_mmap_base(addr, base) (base) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) { bool ret; /* * need RCU to access ->real_parent if CLONE_VM was used along with * CLONE_PARENT. * * We check real_parent->mm == tsk->mm because CLONE_VFORK does not * imply CLONE_VM * * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus * ->real_parent is not necessarily the task doing vfork(), so in * theory we can't rely on task_lock() if we want to dereference it. * * And in this case we can't trust the real_parent->mm == tsk->mm * check, it can be false negative. But we do not care, if init or * another oom-unkillable task does this it should blame itself. */ rcu_read_lock(); ret = tsk->vfork_done && rcu_dereference(tsk->real_parent)->mm == tsk->mm; rcu_read_unlock(); return ret; } /* * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence */ if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } return flags; } #ifdef CONFIG_LOCKDEP extern void __fs_reclaim_acquire(unsigned long ip); extern void __fs_reclaim_release(unsigned long ip); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else static inline void __fs_reclaim_acquire(unsigned long ip) { } static inline void __fs_reclaim_release(unsigned long ip) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate * * Similar to might_sleep() and other annotations, this can be used in functions * that might allocate, but often don't. Compiles to nothing without * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp allows blocking. */ static inline void might_alloc(gfp_t gfp_mask) { fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * * This functions marks the beginning of the GFP_NOIO allocation scope. * All further allocations will implicitly drop __GFP_IO flag and so * they are safe for the IO critical section from the allocation recursion * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * * This function is safe to be used from any context. */ static inline unsigned int memalloc_noio_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOIO; current->flags |= PF_MEMALLOC_NOIO; return flags; } /** * memalloc_noio_restore - Ends the implicit GFP_NOIO scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_noio_save call. */ static inline void memalloc_noio_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } /** * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope. * * This functions marks the beginning of the GFP_NOFS allocation scope. * All further allocations will implicitly drop __GFP_FS flag and so * they are safe for the FS critical section from the allocation recursion * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * * This function is safe to be used from any context. */ static inline unsigned int memalloc_nofs_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOFS; current->flags |= PF_MEMALLOC_NOFS; return flags; } /** * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_nofs_save call. */ static inline void memalloc_nofs_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } static inline unsigned int memalloc_noreclaim_save(void) { unsigned int flags = current->flags & PF_MEMALLOC; current->flags |= PF_MEMALLOC; return flags; } static inline void memalloc_noreclaim_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC) | flags; } static inline unsigned int memalloc_pin_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_PIN; current->flags |= PF_MEMALLOC_PIN; return flags; } static inline void memalloc_pin_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; } #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); /** * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. * * This function marks the beginning of the remote memcg charging scope. All the * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * * NOTE: This function can nest. Users must save the return value and * reset the previous value after their own charging scope is over. */ static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; if (!in_task()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { old = current->active_memcg; current->active_memcg = memcg; } return old; } #else static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { return NULL; } #endif #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2), MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7), }; enum { MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), MEMBARRIER_FLAG_RSEQ = (1U << 1), }; #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS #include <asm/membarrier.h> #endif static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) return; sync_core_before_usermode(); } extern void membarrier_exec_mmap(struct mm_struct *mm); extern void membarrier_update_current_mm(struct mm_struct *next_mm); #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { } #endif static inline void membarrier_exec_mmap(struct mm_struct *mm) { } static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } static inline void membarrier_update_current_mm(struct mm_struct *next_mm) { } #endif #endif /* _LINUX_SCHED_MM_H */
117 15 15 15 551 439 117 117 117 116 5 1 2 2 2 11 1 10 11 11 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 // SPDX-License-Identifier: GPL-2.0 #include "cgroup-internal.h" #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> /* cgroup namespaces */ static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); } static void dec_cgroup_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); } static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } EXPORT_SYMBOL(free_cgroup_ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); if (!(flags & CLONE_NEWCGROUP)) { get_cgroup_ns(old_ns); return old_ns; } /* Allow only sysadmin to create cgroup namespace. */ if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); ucounts = inc_cgroup_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); spin_unlock_irq(&css_set_lock); new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) { return container_of(ns, struct cgroup_namespace, ns); } static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Don't need to do anything if we are attaching to our own cgroupns. */ if (cgroup_ns == nsproxy->cgroup_ns) return 0; get_cgroup_ns(cgroup_ns); put_cgroup_ns(nsproxy->cgroup_ns); nsproxy->cgroup_ns = cgroup_ns; return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) { struct cgroup_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->cgroup_ns; get_cgroup_ns(ns); } task_unlock(task); return ns ? &ns->ns : NULL; } static void cgroupns_put(struct ns_common *ns) { put_cgroup_ns(to_cg_ns(ns)); } static struct user_namespace *cgroupns_owner(struct ns_common *ns) { return to_cg_ns(ns)->user_ns; } const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, .owner = cgroupns_owner, };
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0+ /* * PlayStation 2 Trance Vibrator driver * * Copyright (C) 2006 Sam Hocevar <sam@zoy.org> */ /* Standard include files */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb.h> #define DRIVER_AUTHOR "Sam Hocevar, sam@zoy.org" #define DRIVER_DESC "PlayStation 2 Trance Vibrator driver" #define TRANCEVIBRATOR_VENDOR_ID 0x0b49 /* ASCII Corporation */ #define TRANCEVIBRATOR_PRODUCT_ID 0x064f /* Trance Vibrator */ static const struct usb_device_id id_table[] = { { USB_DEVICE(TRANCEVIBRATOR_VENDOR_ID, TRANCEVIBRATOR_PRODUCT_ID) }, { }, }; MODULE_DEVICE_TABLE (usb, id_table); /* Driver-local specific stuff */ struct trancevibrator { struct usb_device *udev; unsigned int speed; }; static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct trancevibrator *tv = usb_get_intfdata(intf); return sprintf(buf, "%d\n", tv->speed); } static ssize_t speed_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_interface *intf = to_usb_interface(dev); struct trancevibrator *tv = usb_get_intfdata(intf); int temp, retval, old; retval = kstrtoint(buf, 10, &temp); if (retval) return retval; if (temp > 255) temp = 255; else if (temp < 0) temp = 0; old = tv->speed; tv->speed = temp; dev_dbg(&tv->udev->dev, "speed = %d\n", tv->speed); /* Set speed */ retval = usb_control_msg(tv->udev, usb_sndctrlpipe(tv->udev, 0), 0x01, /* vendor request: set speed */ USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, tv->speed, /* speed value */ 0, NULL, 0, USB_CTRL_SET_TIMEOUT); if (retval) { tv->speed = old; dev_dbg(&tv->udev->dev, "retval = %d\n", retval); return retval; } return count; } static DEVICE_ATTR_RW(speed); static struct attribute *tv_attrs[] = { &dev_attr_speed.attr, NULL, }; ATTRIBUTE_GROUPS(tv); static int tv_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(interface); struct trancevibrator *dev; int retval; dev = kzalloc(sizeof(struct trancevibrator), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } dev->udev = usb_get_dev(udev); usb_set_intfdata(interface, dev); return 0; error: kfree(dev); return retval; } static void tv_disconnect(struct usb_interface *interface) { struct trancevibrator *dev; dev = usb_get_intfdata (interface); usb_set_intfdata(interface, NULL); usb_put_dev(dev->udev); kfree(dev); } /* USB subsystem object */ static struct usb_driver tv_driver = { .name = "trancevibrator", .probe = tv_probe, .disconnect = tv_disconnect, .id_table = id_table, .dev_groups = tv_groups, }; module_usb_driver(tv_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
554 75 75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 #include <linux/notifier.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> #include <net/netns/ipv6.h> #include <net/ip6_fib.h> int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifier(nb, event_type, info); } int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { info->family = AF_INET6; return call_fib_notifiers(net, event_type, info); } static unsigned int fib6_seq_read(struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } static int fib6_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; err = fib6_rules_dump(net, nb, extack); if (err) return err; return fib6_tables_dump(net, nb, extack); } static const struct fib_notifier_ops fib6_notifier_ops_template = { .family = AF_INET6, .fib_seq_read = fib6_seq_read, .fib_dump = fib6_dump, .owner = THIS_MODULE, }; int __net_init fib6_notifier_init(struct net *net) { struct fib_notifier_ops *ops; ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.notifier_ops = ops; return 0; } void __net_exit fib6_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.notifier_ops); }
187 38 186 187 166 186 187 187 187 187 187 187 186 187 187 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/hash.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ * * Portions of this code from linux/fs/ext3/hash.c * * Copyright (C) 2002 by Theodore Ts'o */ #include <linux/types.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/pagemap.h> #include <linux/unicode.h> #include "f2fs.h" /* * Hashing code copied from ext3 */ #define DELTA 0x9E3779B9 static void TEA_transform(unsigned int buf[4], unsigned int const in[]) { __u32 sum = 0; __u32 b0 = buf[0], b1 = buf[1]; __u32 a = in[0], b = in[1], c = in[2], d = in[3]; int n = 16; do { sum += DELTA; b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); } while (--n); buf[0] += b0; buf[1] += b1; } static void str2hashbuf(const unsigned char *msg, size_t len, unsigned int *buf, int num) { unsigned pad, val; int i; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; val = pad; if (len > num * 4) len = num * 4; for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; val = msg[i] + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; num--; } } if (--num >= 0) *buf++ = val; while (--num >= 0) *buf++ = pad; } static u32 TEA_hash_name(const u8 *p, size_t len) { __u32 in[8], buf[4]; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; while (1) { str2hashbuf(p, len, in, 4); TEA_transform(buf, in); p += 16; if (len <= 16) break; len -= 16; } return buf[0] & ~F2FS_HASH_COL_BIT; } /* * Compute @fname->hash. For all directories, @fname->disk_name must be set. * For casefolded directories, @fname->usr_fname must be set, and also * @fname->cf_name if the filename is valid Unicode and is not "." or "..". */ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) { const u8 *name = fname->disk_name.name; size_t len = fname->disk_name.len; WARN_ON_ONCE(!name); if (is_dot_dotdot(name, len)) { fname->hash = 0; return; } #ifdef CONFIG_UNICODE if (IS_CASEFOLDED(dir)) { /* * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that * should only be because the name wasn't valid Unicode or was * "." or "..", so fall back to treating the name as an opaque * byte sequence. Note that to handle encrypted directories, * the fallback must use usr_fname (plaintext) rather than * disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { name = fname->cf_name.name; len = fname->cf_name.len; } else { name = fname->usr_fname->name; len = fname->usr_fname->len; } if (IS_ENCRYPTED(dir)) { struct qstr tmp = QSTR_INIT(name, len); fname->hash = cpu_to_le32(fscrypt_fname_siphash(dir, &tmp)); return; } } #endif fname->hash = cpu_to_le32(TEA_hash_name(name, len)); }
48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming); int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); void batadv_tt_global_entry_release(struct kref *ref); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid); void batadv_tt_free(struct batadv_priv *bat_priv); bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid); void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv); bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); int batadv_tt_cache_init(void); void batadv_tt_cache_destroy(void); /** * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ static inline void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { if (!tt_global_entry) return; kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); } #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
1 10 3 1 6 1 4 6 1 6 1 6 13 13 1 1 4 2 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 // SPDX-License-Identifier: GPL-2.0-only /* * Optimized MPEG FS - inode and super operations. * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com> */ #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/cred.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/crc-itu-t.h> #include "omfs.h" MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>"); MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux"); MODULE_LICENSE("GPL"); struct buffer_head *omfs_bread(struct super_block *sb, sector_t block) { struct omfs_sb_info *sbi = OMFS_SB(sb); if (block >= sbi->s_num_blocks) return NULL; return sb_bread(sb, clus_to_blk(sbi, block)); } struct inode *omfs_new_inode(struct inode *dir, umode_t mode) { struct inode *inode; u64 new_block; int err; int len; struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb); inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors, &new_block, &len); if (err) goto fail; inode->i_ino = new_block; inode_init_owner(&init_user_ns, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = &omfs_dir_inops; inode->i_fop = &omfs_dir_operations; inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case S_IFREG: inode->i_op = &omfs_file_inops; inode->i_fop = &omfs_file_operations; inode->i_size = 0; break; } insert_inode_hash(inode); mark_inode_dirty(inode); return inode; fail: make_bad_inode(inode); iput(inode); return ERR_PTR(err); } /* * Update the header checksums for a dirty inode based on its contents. * Caller is expected to hold the buffer head underlying oi and mark it * dirty. */ static void omfs_update_checksums(struct omfs_inode *oi) { int xor, i, ofs = 0, count; u16 crc = 0; unsigned char *ptr = (unsigned char *) oi; count = be32_to_cpu(oi->i_head.h_body_size); ofs = sizeof(struct omfs_header); crc = crc_itu_t(crc, ptr + ofs, count); oi->i_head.h_crc = cpu_to_be16(crc); xor = ptr[0]; for (i = 1; i < OMFS_XOR_COUNT; i++) xor ^= ptr[i]; oi->i_head.h_check_xor = xor; } static int __omfs_write_inode(struct inode *inode, int wait) { struct omfs_inode *oi; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); struct buffer_head *bh, *bh2; u64 ctime; int i; int ret = -EIO; int sync_failed = 0; /* get current inode since we may have written sibling ptrs etc. */ bh = omfs_bread(inode->i_sb, inode->i_ino); if (!bh) goto out; oi = (struct omfs_inode *) bh->b_data; oi->i_head.h_self = cpu_to_be64(inode->i_ino); if (S_ISDIR(inode->i_mode)) oi->i_type = OMFS_DIR; else if (S_ISREG(inode->i_mode)) oi->i_type = OMFS_FILE; else { printk(KERN_WARNING "omfs: unknown file type: %d\n", inode->i_mode); goto out_brelse; } oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize - sizeof(struct omfs_header)); oi->i_head.h_version = 1; oi->i_head.h_type = OMFS_INODE_NORMAL; oi->i_head.h_magic = OMFS_IMAGIC; oi->i_size = cpu_to_be64(inode->i_size); ctime = inode->i_ctime.tv_sec * 1000LL + ((inode->i_ctime.tv_nsec + 999)/1000); oi->i_ctime = cpu_to_be64(ctime); omfs_update_checksums(oi); mark_buffer_dirty(bh); if (wait) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) sync_failed = 1; } /* if mirroring writes, copy to next fsblock */ for (i = 1; i < sbi->s_mirrors; i++) { bh2 = omfs_bread(inode->i_sb, inode->i_ino + i); if (!bh2) goto out_brelse; memcpy(bh2->b_data, bh->b_data, bh->b_size); mark_buffer_dirty(bh2); if (wait) { sync_dirty_buffer(bh2); if (buffer_req(bh2) && !buffer_uptodate(bh2)) sync_failed = 1; } brelse(bh2); } ret = (sync_failed) ? -EIO : 0; out_brelse: brelse(bh); out: return ret; } static int omfs_write_inode(struct inode *inode, struct writeback_control *wbc) { return __omfs_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int omfs_sync_inode(struct inode *inode) { return __omfs_write_inode(inode, 1); } /* * called when an entry is deleted, need to clear the bits in the * bitmaps. */ static void omfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_nlink) return; if (S_ISREG(inode->i_mode)) { inode->i_size = 0; omfs_shrink_inode(inode); } omfs_clear_range(inode->i_sb, inode->i_ino, 2); } struct inode *omfs_iget(struct super_block *sb, ino_t ino) { struct omfs_sb_info *sbi = OMFS_SB(sb); struct omfs_inode *oi; struct buffer_head *bh; u64 ctime; unsigned long nsecs; struct inode *inode; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; bh = omfs_bread(inode->i_sb, ino); if (!bh) goto iget_failed; oi = (struct omfs_inode *)bh->b_data; /* check self */ if (ino != be64_to_cpu(oi->i_head.h_self)) goto fail_bh; inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; ctime = be64_to_cpu(oi->i_ctime); nsecs = do_div(ctime, 1000) * 1000L; inode->i_atime.tv_sec = ctime; inode->i_mtime.tv_sec = ctime; inode->i_ctime.tv_sec = ctime; inode->i_atime.tv_nsec = nsecs; inode->i_mtime.tv_nsec = nsecs; inode->i_ctime.tv_nsec = nsecs; inode->i_mapping->a_ops = &omfs_aops; switch (oi->i_type) { case OMFS_DIR: inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask); inode->i_op = &omfs_dir_inops; inode->i_fop = &omfs_dir_operations; inode->i_size = sbi->s_sys_blocksize; inc_nlink(inode); break; case OMFS_FILE: inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask); inode->i_fop = &omfs_file_operations; inode->i_size = be64_to_cpu(oi->i_size); break; } brelse(bh); unlock_new_inode(inode); return inode; fail_bh: brelse(bh); iget_failed: iget_failed(inode); return ERR_PTR(-EIO); } static void omfs_put_super(struct super_block *sb) { struct omfs_sb_info *sbi = OMFS_SB(sb); kfree(sbi->s_imap); kfree(sbi); sb->s_fs_info = NULL; } static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct omfs_sb_info *sbi = OMFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); buf->f_type = OMFS_MAGIC; buf->f_bsize = sbi->s_blocksize; buf->f_blocks = sbi->s_num_blocks; buf->f_files = sbi->s_num_blocks; buf->f_namelen = OMFS_NAMELEN; buf->f_fsid = u64_to_fsid(id); buf->f_bfree = buf->f_bavail = buf->f_ffree = omfs_count_free(s); return 0; } /* * Display the mount options in /proc/mounts. */ static int omfs_show_options(struct seq_file *m, struct dentry *root) { struct omfs_sb_info *sbi = OMFS_SB(root->d_sb); umode_t cur_umask = current_umask(); if (!uid_eq(sbi->s_uid, current_uid())) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid)); if (!gid_eq(sbi->s_gid, current_gid())) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->s_gid)); if (sbi->s_dmask == sbi->s_fmask) { if (sbi->s_fmask != cur_umask) seq_printf(m, ",umask=%o", sbi->s_fmask); } else { if (sbi->s_dmask != cur_umask) seq_printf(m, ",dmask=%o", sbi->s_dmask); if (sbi->s_fmask != cur_umask) seq_printf(m, ",fmask=%o", sbi->s_fmask); } return 0; } static const struct super_operations omfs_sops = { .write_inode = omfs_write_inode, .evict_inode = omfs_evict_inode, .put_super = omfs_put_super, .statfs = omfs_statfs, .show_options = omfs_show_options, }; /* * For Rio Karma, there is an on-disk free bitmap whose location is * stored in the root block. For ReplayTV, there is no such free bitmap * so we have to walk the tree. Both inodes and file data are allocated * from the same map. This array can be big (300k) so we allocate * in units of the blocksize. */ static int omfs_get_imap(struct super_block *sb) { unsigned int bitmap_size, array_size; int count; struct omfs_sb_info *sbi = OMFS_SB(sb); struct buffer_head *bh; unsigned long **ptr; sector_t block; bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8); array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize); if (sbi->s_bitmap_ino == ~0ULL) goto out; sbi->s_imap_size = array_size; sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL); if (!sbi->s_imap) goto nomem; block = clus_to_blk(sbi, sbi->s_bitmap_ino); if (block >= sbi->s_num_blocks) goto nomem; ptr = sbi->s_imap; for (count = bitmap_size; count > 0; count -= sb->s_blocksize) { bh = sb_bread(sb, block++); if (!bh) goto nomem_free; *ptr = kmemdup(bh->b_data, sb->s_blocksize, GFP_KERNEL); if (!*ptr) { brelse(bh); goto nomem_free; } if (count < sb->s_blocksize) memset((void *)*ptr + count, 0xff, sb->s_blocksize - count); brelse(bh); ptr++; } out: return 0; nomem_free: for (count = 0; count < array_size; count++) kfree(sbi->s_imap[count]); kfree(sbi->s_imap); nomem: sbi->s_imap = NULL; sbi->s_imap_size = 0; return -ENOMEM; } enum { Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err }; static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_umask, "umask=%o"}, {Opt_dmask, "dmask=%o"}, {Opt_fmask, "fmask=%o"}, {Opt_err, NULL}, }; static int parse_options(char *options, struct omfs_sb_info *sbi) { char *p; substring_t args[MAX_OPT_ARGS]; int option; if (!options) return 1; while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_uid: if (match_int(&args[0], &option)) return 0; sbi->s_uid = make_kuid(current_user_ns(), option); if (!uid_valid(sbi->s_uid)) return 0; break; case Opt_gid: if (match_int(&args[0], &option)) return 0; sbi->s_gid = make_kgid(current_user_ns(), option); if (!gid_valid(sbi->s_gid)) return 0; break; case Opt_umask: if (match_octal(&args[0], &option)) return 0; sbi->s_fmask = sbi->s_dmask = option; break; case Opt_dmask: if (match_octal(&args[0], &option)) return 0; sbi->s_dmask = option; break; case Opt_fmask: if (match_octal(&args[0], &option)) return 0; sbi->s_fmask = option; break; default: return 0; } } return 1; } static int omfs_fill_super(struct super_block *sb, void *data, int silent) { struct buffer_head *bh, *bh2; struct omfs_super_block *omfs_sb; struct omfs_root_block *omfs_rb; struct omfs_sb_info *sbi; struct inode *root; int ret = -EINVAL; sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; sbi->s_uid = current_uid(); sbi->s_gid = current_gid(); sbi->s_dmask = sbi->s_fmask = current_umask(); if (!parse_options((char *) data, sbi)) goto end; sb->s_maxbytes = 0xffffffff; sb->s_time_gran = NSEC_PER_MSEC; sb->s_time_min = 0; sb->s_time_max = U64_MAX / MSEC_PER_SEC; sb_set_blocksize(sb, 0x200); bh = sb_bread(sb, 0); if (!bh) goto end; omfs_sb = (struct omfs_super_block *)bh->b_data; if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) { if (!silent) printk(KERN_ERR "omfs: Invalid superblock (%x)\n", omfs_sb->s_magic); goto out_brelse_bh; } sb->s_magic = OMFS_MAGIC; sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks); sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize); sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors); sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block); sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); mutex_init(&sbi->s_bitmap_lock); if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) { printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n", (unsigned long long)sbi->s_num_blocks); goto out_brelse_bh; } if (sbi->s_sys_blocksize > PAGE_SIZE) { printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", sbi->s_sys_blocksize); goto out_brelse_bh; } if (sbi->s_blocksize < sbi->s_sys_blocksize || sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) { printk(KERN_ERR "omfs: block size (%d) is out of range\n", sbi->s_blocksize); goto out_brelse_bh; } /* * Use sys_blocksize as the fs block since it is smaller than a * page while the fs blocksize can be larger. */ sb_set_blocksize(sb, sbi->s_sys_blocksize); /* * ...and the difference goes into a shift. sys_blocksize is always * a power of two factor of blocksize. */ sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) - get_bitmask_order(sbi->s_sys_blocksize); bh2 = omfs_bread(sb, be64_to_cpu(omfs_sb->s_root_block)); if (!bh2) goto out_brelse_bh; omfs_rb = (struct omfs_root_block *)bh2->b_data; sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap); sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize); if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) { printk(KERN_ERR "omfs: block count discrepancy between " "super and root blocks (%llx, %llx)\n", (unsigned long long)sbi->s_num_blocks, (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks)); goto out_brelse_bh2; } if (sbi->s_bitmap_ino != ~0ULL && sbi->s_bitmap_ino > sbi->s_num_blocks) { printk(KERN_ERR "omfs: free space bitmap location is corrupt " "(%llx, total blocks %llx)\n", (unsigned long long) sbi->s_bitmap_ino, (unsigned long long) sbi->s_num_blocks); goto out_brelse_bh2; } if (sbi->s_clustersize < 1 || sbi->s_clustersize > OMFS_MAX_CLUSTER_SIZE) { printk(KERN_ERR "omfs: cluster size out of range (%d)", sbi->s_clustersize); goto out_brelse_bh2; } ret = omfs_get_imap(sb); if (ret) goto out_brelse_bh2; sb->s_op = &omfs_sops; root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir)); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_brelse_bh2; } sb->s_root = d_make_root(root); if (!sb->s_root) { ret = -ENOMEM; goto out_brelse_bh2; } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; out_brelse_bh2: brelse(bh2); out_brelse_bh: brelse(bh); end: if (ret) kfree(sbi); return ret; } static struct dentry *omfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super); } static struct file_system_type omfs_fs_type = { .owner = THIS_MODULE, .name = "omfs", .mount = omfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("omfs"); static int __init init_omfs_fs(void) { return register_filesystem(&omfs_fs_type); } static void __exit exit_omfs_fs(void) { unregister_filesystem(&omfs_fs_type); } module_init(init_omfs_fs); module_exit(exit_omfs_fs);
14 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #ifndef __LOG_DOT_H__ #define __LOG_DOT_H__ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/writeback.h> #include "incore.h" #include "inode.h" /* * The minimum amount of log space required for a log flush is one block for * revokes and one block for the log header. Log flushes other than * GFS2_LOG_HEAD_FLUSH_NORMAL may write one or two more log headers. */ #define GFS2_LOG_FLUSH_MIN_BLOCKS 4 /** * gfs2_log_lock - acquire the right to mess with the log manager * @sdp: the filesystem * */ static inline void gfs2_log_lock(struct gfs2_sbd *sdp) __acquires(&sdp->sd_log_lock) { spin_lock(&sdp->sd_log_lock); } /** * gfs2_log_unlock - release the right to mess with the log manager * @sdp: the filesystem * */ static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) __releases(&sdp->sd_log_lock) { spin_unlock(&sdp->sd_log_lock); } static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, unsigned int value) { if (++value == sdp->sd_jdesc->jd_blocks) { value = 0; } sdp->sd_log_tail = value; sdp->sd_log_flush_tail = value; sdp->sd_log_head = value; } static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp)) return; if (list_empty(&ip->i_ordered)) { spin_lock(&sdp->sd_ordered_lock); if (list_empty(&ip->i_ordered)) list_add(&ip->i_ordered, &sdp->sd_log_ordered); spin_unlock(&sdp->sd_ordered_lock); } } extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp); extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, unsigned int *extra_revokes); extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, unsigned int *extra_revokes); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, int op_flags); extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 type); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); extern void log_flush_wait(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); extern void gfs2_ail_drain(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */
40 40 228 228 227 228 228 228 228 569 570 569 145 145 145 315 313 277 273 121 273 26 27 1 12 121 121 4 116 2 6 2 37 144 37 367 1 363 9 37 4 309 444 309 306 307 43 5 1 2 6 6 36 318 192 5 4 37 240 43 217 37 34 3 37 2 37 6 1 117 152 133 17 17 13 5 8 4 4 22 189 20 190 201 5 211 211 7 91 73 131 204 1 571 569 5 5 570 571 573 569 5 5 2 2 174 3 3 2 1 69 5 69 6 71 4 78 78 4 3 69 4 73 73 11 11 10 11 4 4 4 11 177 6 63 638 1 121 482 13 4 4 62 11 4 4 1 4 58 29 3 45 2 2 175 175 175 175 1 2 2 1 23 4 19 5 8 8 8 8 8 3 3 6 1 1 4 3 1 360 4 359 353 5 5 359 300 300 298 74 2 1 1 2 288 8 190 190 266 2 1 1 16 248 78 197 2 11 185 1 81 1 9 9 21 1 11 19 86 81 9 15 88 77 18 11 4 2 2 1 1 1 2 2 2 1 573 380 573 1 1 88 88 1 1 77 78 225 40 1 8 2 3 3 2 1 10 7 2 2 4 5 2 500 635 636 174 1 3 178 178 4 178 1 2 171 173 3 732 4 176 22 605 37 37 37 37 178 178 178 174 174 145 173 173 170 4 168 9 166 2 7 173 630 630 628 630 628 631 629 631 151 2 503 632 502 505 505 12 8 4 4 4 3 1 2 2 2 2 505 503 505 2 506 505 1 1 507 502 61 503 506 2 502 1 506 4 503 3 1 2 1 8 8 8 1 1 2 5 3 2 57 598 595 597 57 592 2 585 7 7 4 3 1 1 1 2 1 1 1 302 13 31 4 2 1 6 1 1 1 1 9 4 16 1 3 1 2 3 1 3 2 4 3 173 173 4 3 5 6 5 9 8 1 3 1 1 1 3 1 1 4 3 2 1 4 1 8 47 2 1 1 5 1 1 25 30 2 1 1 2 1 249 49 9 3 1 1 1 1 3 2 2 1 65 1 1 1 1 1 1 107 1 1 2 1 2 1 1 2 1 33 1 1 1 24 1 3 1 1 1 13 257 249 105 2 3 329 101 103 2 2 4 2 78 60 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 136 100 1 1 1 1 14 84 1 1 1 1 25 5 1119 1116 1117 1118 1119 5 332 335 333 5 336 336 1063 1060 31 31 6 41 438 42 12 28 1 1 1 4 1 6 1 1 3 2 5 4 2 1 1 2 15 15 15 1 14 1 1 13 14 15 25 1 1 1 22 23 15 11 10 19 14 6 10 9 19 3 11 5 4 5 4 9 5 1 4 1 1 1 1 2 2 1 1 1 3 1 2 1 6 2 3 3 3 1 13 1 3 1 1 2 1 1 2 1 3 528 2 3 2 32 1 31 1 1 4 1 8 1 118 1 2 1 80 227 226 1 1 7 1 5 1 6 1 4 4 1 17 1 1 4 1 1 3 1 1 6 5 4 1 1 1 1 12 1 1 3 2 1 17 3 1 7 1 13 5 1 22 2 1 1 1 1 2 1 1 25 1 5 3 18 3 3 3 1 2 6 6 15 3 15 27 1 1 1 1 1 2 1 2 2 2 2 1 2 1 3 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 32 8 6 2 1 2 8 1 6 7 1 4 3 2 2 3 3 2 116 3 2 1 1 1 1 1 1 15 2 6 1 7 1 1 2 1 8 25 1 1 3 1 1 3 1 1 2 1 1 6 1 4 1 5 1 1 1 1 1 1 1 1 1 12 3 8 1 70 70 15 65 65 65 156 4 14 470 27 463 77 78 87 88 88 20 1 4 20 20 20 3 3 152 152 160 84 130 8 125 39 69 209 177 60 128 100 100 70 70 65 62 227 131 148 131 159 15 124 124 244 245 226 1 1 7 143 123 209 120 54 54 10 2 2 40 2 1 41 10 42 38 64 83 52 38 38 38 8 26 5 64 64 29 1 8 8 5 5 3 16 16 14 122 75 2 97 2 28 13 11 6 2 121 71 5 3 2 475 474 13 10 463 69 475 70 119 106 105 3 4 4 1 1 27 3 2 1 1 7 7 316 317 215 214 1 2 2 299 6 1 130 35 106 26 487 487 254 1 247 1 2 184 3 182 7 176 217 118 115 114 1 208 73 120 132 11 11 11 9 9 6 2 4 89 4 271 2 473 5 476 5 1 497 87 2 478 312 152 1 37 37 168 338 1 339 332 26 274 123 17 129 14 10 1 204 124 301 239 131 241 241 240 299 3 40 9 5 4 28 18 1 4 1 3 3 3 18 10 48 18 38 48 28 39 40 39 9 47 56 56 56 56 23 151 2 2 2 2 2 2 8 8 8 8 1 2 1 4 8 3 3 593 559 98 470 560 549 10 161 14 65 36 366 367 211 332 23 516 31 587 585 15 549 20 323 330 15 14 2 2 580 4 7 2 577 7 4 23 23 4 556 42 3 27 29 21 30 571 11 572 4 53 5 5 5 5 5 5 3 3 3 3 3 3 3 3 5 3 5 7 7 4 3 99 8 8 8 64 64 4 55 5 55 57 58 2 56 130 130 3 108 503 501 100 596 593 4 595 423 594 595 596 596 596 547 507 596 595 31 597 1 595 1 1 385 595 591 593 23 593 591 596 591 596 596 593 2 594 591 31 594 2 595 589 487 589 585 2 587 27 552 533 571 574 573 574 391 555 52 558 443 17 13 569 571 1 1 573 13 571 573 574 572 573 572 72 99 573 572 572 560 15 428 16 429 11 22 11 8 8 16 610 36 37 597 22 597 541 7 538 5 2 542 83 84 536 87 5 5 84 85 4 82 83 2 3 82 1 1 182 629 625 629 624 558 555 623 16 7 1 1 598 14 112 504 547 2 599 541 15 561 13 11 2 13 13 2 85 85 83 100 99 100 100 99 94 2 5 4 1 83 83 2 2 2 1 2 9 9 3 3 5 1 5 15 15 129 3 7 129 6 6 135 135 2 116 116 115 116 3 102 12 113 116 116 51 119 12 13 97 88 24 22 11 11 3 1 11 4 108 6 1 1 3 3 3 3 6 4 4 1 1 1 1 4 11 2 13 4 11 11 2 3 2 9 4 2 6 173 15 173 173 173 173 35 139 173 173 170 7 165 15 15 15 5 10 177 177 177 177 176 177 173 2 175 177 176 1 173 4 2 145 145 145 145 27 27 177 35 4 633 636 31 152 1 151 149 30 30 30 11 29 61 61 61 61 31 4 31 4 29 30 30 30 30 11 124 124 124 114 114 2 122 35 113 124 132 132 132 132 124 20 123 20 127 3 2 132 132 132 29 11 6 2 37 1 1 5 30 2 31 17 9 6 23 19 4 38 38 129 42 27 15 27 625 7 268 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 1 1 2 2 1 3 4 6 1 5 447 7 6 28 5 5 5 5 2 5 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> */ #include <linux/kvm_host.h> #include "irq.h" #include "ioapic.h" #include "mmu.h" #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" #include "hyperv.h" #include "lapic.h" #include "xen.h" #include <linux/clocksource.h> #include <linux/interrupt.h> #include <linux/kvm.h> #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> #include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> #include <linux/srcu.h> #include <linux/slab.h> #include <linux/perf_event.h> #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/pci.h> #include <linux/timekeeper_internal.h> #include <linux/pvclock_gtod.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> #include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> #include <linux/entry-kvm.h> #include <linux/suspend.h> #include <trace/events/kvm.h> #include <asm/debugreg.h> #include <asm/msr.h> #include <asm/desc.h> #include <asm/mce.h> #include <asm/pkru.h> #include <linux/kernel_stat.h> #include <asm/fpu/internal.h> /* Ugh! */ #include <asm/pvclock.h> #include <asm/div64.h> #include <asm/irq_remapping.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> #include <asm/intel_pt.h> #include <asm/emulate_prefix.h> #include <asm/sgx.h> #include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS #include "trace.h" #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ ((struct kvm_vcpu *)(ctxt)->vcpu) /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM */ #ifdef CONFIG_X86_64 static u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); #else static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void process_smi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); #define KVM_X86_OP(func) \ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_NULL KVM_X86_OP #include <asm/kvm-x86-ops.h> EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); EXPORT_SYMBOL_GPL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); bool __read_mostly kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 __read_mostly kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); u64 __read_mostly kvm_default_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); bool __read_mostly kvm_has_bus_lock_exit; EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables * adaptive tuning starting from default advancement of 1000ns. '0' disables * advancement entirely. Any other value is used as-is and disables adaptive * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, S_IRUGO); EXPORT_SYMBOL_GPL(enable_vmware_backdoor); static bool __read_mostly force_emulation_prefix = false; module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); /* Enable/disable SMT_RSB bug mitigation */ bool __read_mostly mitigate_smt_rsb; module_param(mitigate_smt_rsb, bool, 0444); /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU * returns to userspace, i.e. the kernel can run with the guest's value. */ #define KVM_MAX_NR_USER_RETURN_MSRS 16 struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; struct kvm_user_return_msr_values { u64 host; u64 curr; } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; u32 __read_mostly kvm_nr_uret_msrs; EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU) u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), STATS_DESC_COUNTER(VM, mmu_pte_write), STATS_DESC_COUNTER(VM, mmu_pde_zapped), STATS_DESC_COUNTER(VM, mmu_flooded), STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), STATS_DESC_ICOUNTER(VM, pages_4k), STATS_DESC_ICOUNTER(VM, pages_2m), STATS_DESC_ICOUNTER(VM, pages_1g), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vm_stats_desc), }; const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, pf_fixed), STATS_DESC_COUNTER(VCPU, pf_guest), STATS_DESC_COUNTER(VCPU, tlb_flush), STATS_DESC_COUNTER(VCPU, invlpg), STATS_DESC_COUNTER(VCPU, exits), STATS_DESC_COUNTER(VCPU, io_exits), STATS_DESC_COUNTER(VCPU, mmio_exits), STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, irq_window_exits), STATS_DESC_COUNTER(VCPU, nmi_window_exits), STATS_DESC_COUNTER(VCPU, l1d_flush), STATS_DESC_COUNTER(VCPU, halt_exits), STATS_DESC_COUNTER(VCPU, request_irq_exits), STATS_DESC_COUNTER(VCPU, irq_exits), STATS_DESC_COUNTER(VCPU, host_state_reload), STATS_DESC_COUNTER(VCPU, fpu_reload), STATS_DESC_COUNTER(VCPU, insn_emulation), STATS_DESC_COUNTER(VCPU, insn_emulation_fail), STATS_DESC_COUNTER(VCPU, hypercalls), STATS_DESC_COUNTER(VCPU, irq_injections), STATS_DESC_COUNTER(VCPU, nmi_injections), STATS_DESC_COUNTER(VCPU, req_event), STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), .id_offset = sizeof(struct kvm_stats_header), .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + sizeof(kvm_vcpu_stats_desc), }; u64 __read_mostly host_xcr0; u64 __read_mostly supported_xcr0; EXPORT_SYMBOL_GPL(supported_xcr0); static struct kmem_cache *x86_fpu_cache; static struct kmem_cache *x86_emulator_cache; /* * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. */ static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; if (ignore_msrs) { if (report_ignored_msrs) kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, data); /* Mask the error */ return true; } else { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", op, msr, data); return false; } } static struct kmem_cache *kvm_alloc_emulator_cache(void) { unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); unsigned int size = sizeof(struct x86_emulate_ctxt); return kmem_cache_create_usercopy("x86_emulator", size, __alignof__(struct x86_emulate_ctxt), SLAB_ACCOUNT, useroffset, size - useroffset, NULL); } static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; for (i = 0; i < ASYNC_PF_PER_VCPU; i++) vcpu->arch.apf.gfns[i] = ~0; } static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; struct kvm_user_return_msrs *msrs = container_of(urn, struct kvm_user_return_msrs, urn); struct kvm_user_return_msr_values *values; unsigned long flags; /* * Disabling irqs at this point since the following code could be * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); if (msrs->registered) { msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { wrmsrl(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } } static int kvm_probe_user_return_msr(u32 msr) { u64 val; int ret; preempt_disable(); ret = rdmsrl_safe(msr, &val); if (ret) goto out; ret = wrmsrl_safe(msr, val); out: preempt_enable(); return ret; } int kvm_add_user_return_msr(u32 msr) { BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); if (kvm_probe_user_return_msr(msr)) return -1; kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; return kvm_nr_uret_msrs++; } EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { if (kvm_uret_msrs_list[i] == msr) return i; } return -1; } EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { rdmsrl_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } } int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; msrs->values[slot].curr = value; if (!msrs->registered) { msrs->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&msrs->urn); msrs->registered = true; } return 0; } EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); if (msrs->registered) kvm_on_user_return(&msrs->urn); } u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { return vcpu->arch.apic_base; } EXPORT_SYMBOL_GPL(kvm_get_apic_base); enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) { return kvm_apic_mode(kvm_get_apic_base(vcpu)); } EXPORT_SYMBOL_GPL(kvm_get_apic_mode); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) return 1; if (!msr_info->host_initiated) { if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) return 1; if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) return 1; } kvm_lapic_set_base(vcpu, msr_info->data); kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. * * Hardware virtualization extension instructions may fault if a reboot turns * off virtualization while processes are running. Usually after catching the * fault we just panic; during reboot instead the instruction is ignored. */ noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 #define EXCPT_PF 2 static int exception_class(int vector) { switch (vector) { case PF_VECTOR: return EXCPT_PF; case DE_VECTOR: case TS_VECTOR: case NP_VECTOR: case SS_VECTOR: case GP_VECTOR: return EXCPT_CONTRIBUTORY; default: break; } return EXCPT_BENIGN; } #define EXCPT_FAULT 0 #define EXCPT_TRAP 1 #define EXCPT_ABORT 2 #define EXCPT_INTERRUPT 3 #define EXCPT_DB 4 static int exception_type(int vector) { unsigned int mask; if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) return EXCPT_INTERRUPT; mask = 1 << vector; /* * #DBs can be trap-like or fault-like, the caller must check other CPU * state, e.g. DR6, to determine whether a #DB is a trap or fault. */ if (mask & (1 << DB_VECTOR)) return EXCPT_DB; if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR))) return EXCPT_TRAP; if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) return EXCPT_ABORT; /* Reserved exceptions will result in fault */ return EXCPT_FAULT; } void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) { unsigned nr = vcpu->arch.exception.nr; bool has_payload = vcpu->arch.exception.has_payload; unsigned long payload = vcpu->arch.exception.payload; if (!has_payload) return; switch (nr) { case DB_VECTOR: /* * "Certain debug exceptions may clear bit 0-3. The * remaining contents of the DR6 register are never * cleared by the processor". */ vcpu->arch.dr6 &= ~DR_TRAP_BITS; /* * In order to reflect the #DB exception payload in guest * dr6, three components need to be considered: active low * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, * DR6_BS and DR6_BT) * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. * In the target guest dr6: * FIXED_1 bits should always be set. * Active low bits should be cleared if 1-setting in payload. * Active high bits should be set if 1-setting in payload. * * Note, the payload is compatible with the pending debug * exceptions/exit qualification under VMX, that active_low bits * are active high in payload. * So they need to be flipped for DR6. */ vcpu->arch.dr6 |= DR6_ACTIVE_LOW; vcpu->arch.dr6 |= payload; vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; /* * The #DB payload is defined as compatible with the 'pending * debug exceptions' field under VMX, not DR6. While bit 12 is * defined in the 'pending debug exceptions' field (enabled * breakpoint), it is reserved and must be zero in DR6. */ vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: vcpu->arch.cr2 = payload; break; } vcpu->arch.exception.has_payload = false; vcpu->arch.exception.payload = 0; } EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); /* Forcibly leave the nested mode in cases like a vCPU reset */ static void kvm_leave_nested(struct kvm_vcpu *vcpu) { kvm_x86_ops.nested_ops->leave_nested(vcpu); } static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject) { u32 prev_nr; int class1, class2; kvm_make_request(KVM_REQ_EVENT, vcpu); if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: if (reinject) { /* * On vmentry, vcpu->arch.exception.pending is only * true if an event injection was blocked by * nested_run_pending. In that case, however, * vcpu_enter_guest requests an immediate exit, * and the guest shouldn't proceed far enough to * need reinjection. */ WARN_ON_ONCE(vcpu->arch.exception.pending); vcpu->arch.exception.injected = true; if (WARN_ON_ONCE(has_payload)) { /* * A reinjected event has already * delivered its payload. */ has_payload = false; payload = 0; } } else { vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; } vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; if (!is_guest_mode(vcpu)) kvm_deliver_exception_payload(vcpu); return; } /* to check exception */ prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); class2 = exception_class(nr); if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { /* * Generate double fault per SDM Table 5-5. Set * exception.pending = true so that the double fault * can trigger a nested vmexit. */ vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; vcpu->arch.exception.has_error_code = true; vcpu->arch.exception.nr = DF_VECTOR; vcpu->arch.exception.error_code = 0; vcpu->arch.exception.has_payload = false; vcpu->arch.exception.payload = 0; } else /* replace previous exception with a new one in a hope that instruction re-execution will regenerate lost exception */ goto queue; } void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) { kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); } EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload) { kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) { kvm_multiple_exception(vcpu, nr, true, error_code, true, payload, false); } int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) kvm_inject_gp(vcpu, 0); else return kvm_skip_emulated_instruction(vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; vcpu->arch.exception.nested_apf = is_guest_mode(vcpu) && fault->async_page_fault; if (vcpu->arch.exception.nested_apf) { vcpu->arch.apf.nested_apf_token = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } else { kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); } } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct kvm_mmu *fault_mmu; WARN_ON_ONCE(fault->vector != PF_VECTOR); fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : vcpu->arch.walk_mmu; /* * Invalidate the TLB entry for the faulting address, if it exists, * else the access will fault indefinitely (and to emulate hardware). */ if ((fault->error_code & PFERR_PRESENT_MASK) && !(fault->error_code & PFERR_RSVD_MASK)) kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, fault_mmu->root_hpa); fault_mmu->inject_page_fault(vcpu, fault); return fault->nested_page_fault; } EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { atomic_inc(&vcpu->arch.nmi_queued); kvm_make_request(KVM_REQ_NMI, vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); } EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; } EXPORT_SYMBOL_GPL(kvm_require_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return true; kvm_queue_exception(vcpu, UD_VECTOR); return false; } EXPORT_SYMBOL_GPL(kvm_require_dr); /* * This function will be used to read from the physical memory of the currently * running guest. The difference to kvm_vcpu_read_guest_page is that this function * can read from guest physical or from the guest's guest physical memory. */ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t ngfn, void *data, int offset, int len, u32 access) { struct x86_exception exception; gfn_t real_gfn; gpa_t ngpa; ngpa = gfn_to_gpa(ngfn); real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); if (real_gfn == UNMAPPED_GVA) return -EFAULT; real_gfn = gpa_to_gfn(real_gfn); return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); } /* * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, offset * sizeof(u64), sizeof(pdpte), PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & pdptr_rsvd_bits(vcpu))) { ret = 0; goto out; } } ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); vcpu->arch.pdptrs_from_userspace = false; out: return ret; } EXPORT_SYMBOL_GPL(load_pdptrs); static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { #ifdef CONFIG_X86_64 if (cr0 & 0xffffffff00000000UL) return false; #endif if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) return false; if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return false; return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0); } void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); } if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); if (((cr0 ^ old_cr0) & X86_CR0_CD) && kvm_arch_has_noncoherent_dma(vcpu->kvm) && !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); } EXPORT_SYMBOL_GPL(kvm_post_set_cr0); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; if (!kvm_is_valid_cr0(vcpu, cr0)) return 1; cr0 |= X86_CR0_ET; /* Write to CR0 reserved bits are ignored, even on Intel. */ cr0 &= ~CR0_RESERVED_BITS; #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && (cr0 & X86_CR0_PG)) { int cs_db, cs_l; if (!is_pae(vcpu)) return 1; static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } #endif if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; static_call(kvm_x86_set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) return; if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); if (vcpu->arch.xsaves_enabled && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) return; if (static_cpu_has(X86_FEATURE_PKU) && (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); if (vcpu->arch.xsaves_enabled && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, host_xss); } } EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; u64 valid_bits; /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ if (index != XCR_XFEATURE_ENABLED_MASK) return 1; if (!(xcr0 & XFEATURE_MASK_FP)) return 1; if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) return 1; /* * Do not allow the guest to set bits that we do not support * saving. However, xcr0 bit 0 is always set, even if the * emulated CPU does not support XSAVE (see fx_init). */ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) return 1; if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != (!(xcr0 & XFEATURE_MASK_BNDCSR))) return 1; if (xcr0 & XFEATURE_MASK_AVX512) { if (!(xcr0 & XFEATURE_MASK_YMM)) return 1; if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) kvm_update_cpuid_runtime(vcpu); return 0; } int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { kvm_inject_gp(vcpu, 0); return 1; } return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return false; if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; return true; } EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { return __kvm_is_valid_cr4(vcpu, cr4) && static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) || (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP; if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; if ((cr4 ^ old_cr4) & X86_CR4_LA57) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) return 1; /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) return 1; } static_call(kvm_x86_set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr4); static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; unsigned long roots_to_free = 0; int i; /* * MOV CR3 and INVPCID are usually not intercepted when using TDP, but * this is reachable when running EPT=1 and unrestricted_guest=0, and * also via the emulator. KVM's TDP page tables are not in the scope of * the invalidation, but the guest's TLB entries need to be flushed as * the CPU may have cached entries in its TLB for the target PCID. */ if (unlikely(tdp_enabled)) { kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return; } /* * If neither the current CR3 nor any of the prev_roots use the given * PCID, then nothing needs to be done here because a resync will * happen anyway before switching to any other CR3. */ if (kvm_get_active_pcid(vcpu) == pcid) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); kvm_mmu_free_roots(vcpu, mmu, roots_to_free); } int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { bool skip_tlb_flush = false; unsigned long pcid = 0; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); if (pcid_enabled) { skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; cr3 &= ~X86_CR3_PCID_NOFLUSH; pcid = cr3 & X86_CR3_PCID_MASK; } #endif /* PDPTRs are always reloaded for PAE paging. */ if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) goto handle_tlb_flush; /* * Do not condition the GPA check on long mode, this helper is used to * stuff CR3, e.g. for RSM emulation, and there is no guarantee that * the current vCPU mode is accurate. */ if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; if (cr3 != kvm_read_cr3(vcpu)) kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); handle_tlb_flush: /* * A load of CR3 that flushes the TLB flushes only the current PCID, * even if PCID is disabled, in which case PCID=0 is flushed. It's a * moot point in the end because _disabling_ PCID will flush all PCIDs, * and it's impossible to use a non-zero PCID when PCID is disabled, * i.e. only PCID=0 can be relevant. */ if (!skip_tlb_flush) kvm_invalidate_pcid(vcpu, pcid); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; } EXPORT_SYMBOL_GPL(kvm_get_cr8); static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { int i; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; } } void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; static_call(kvm_x86_set_dr7)(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } EXPORT_SYMBOL_GPL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) fixed |= DR6_BUS_LOCK; return fixed; } int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { size_t size = ARRAY_SIZE(vcpu->arch.db); switch (dr) { case 0 ... 3: vcpu->arch.db[array_index_nospec(dr, size)] = val; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; case 4: case 6: if (!kvm_dr6_valid(val)) return 1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); break; case 5: default: /* 7 */ if (!kvm_dr7_valid(val)) return 1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); break; } return 0; } EXPORT_SYMBOL_GPL(kvm_set_dr); void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { size_t size = ARRAY_SIZE(vcpu->arch.db); switch (dr) { case 0 ... 3: *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: case 6: *val = vcpu->arch.dr6; break; case 5: default: /* 7 */ *val = vcpu->arch.dr7; break; } } EXPORT_SYMBOL_GPL(kvm_get_dr); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { kvm_inject_gp(vcpu, 0); return 1; } kvm_rax_write(vcpu, (u32)data); kvm_rdx_write(vcpu, data >> 32); return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) * extract the supported MSRs from the related const lists. * msrs_to_save is selected from the msrs_to_save_all to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs * may depend on host virtualization features rather than host cpu features. */ static const u32 msrs_to_save_all[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, MSR_IA32_SPEC_CTRL, MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; static unsigned num_msrs_to_save; static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, HV_X64_MSR_VP_INDEX, HV_X64_MSR_VP_RUNTIME, HV_X64_MSR_SCONTROL, HV_X64_MSR_STIMER0_CONFIG, HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_SYNDBG_OPTIONS, HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, MSR_IA32_TSC_DEADLINE, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, MSR_SMI_COUNT, MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, MSR_IA32_POWER_CTL, MSR_IA32_UCODE_REV, /* * The following list leaves out MSRs whose values are determined * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. * We always support the "true" VMX control MSRs, even if the host * processor does not, so I am putting these registers here rather * than in msrs_to_save_all. */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, MSR_IA32_VMX_TRUE_EXIT_CTLS, MSR_IA32_VMX_TRUE_ENTRY_CTLS, MSR_IA32_VMX_MISC, MSR_IA32_VMX_CR0_FIXED0, MSR_IA32_VMX_CR4_FIXED0, MSR_IA32_VMX_VMCS_ENUM, MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, MSR_K7_HWCR, MSR_KVM_POLL_CONTROL, }; static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* * List of msr numbers which are used to expose MSR-based features that * can be used by a hypervisor to validate requested CPU features. */ static const u32 msr_based_features_all[] = { MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_PINBASED_CTLS, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, MSR_IA32_VMX_PROCBASED_CTLS, MSR_IA32_VMX_TRUE_EXIT_CTLS, MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_TRUE_ENTRY_CTLS, MSR_IA32_VMX_ENTRY_CTLS, MSR_IA32_VMX_MISC, MSR_IA32_VMX_CR0_FIXED0, MSR_IA32_VMX_CR0_FIXED1, MSR_IA32_VMX_CR4_FIXED0, MSR_IA32_VMX_CR4_FIXED1, MSR_IA32_VMX_VMCS_ENUM, MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, }; static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; /* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM * does not yet virtualize. These include: * 10 - MISC_PACKAGE_CTRLS * 11 - ENERGY_FILTERING_CTL * 12 - DOITM * 18 - FB_CLEAR_CTRL * 21 - XAPIC_DISABLE_STATUS * 23 - OVERCLOCKING_STATUS */ #define KVM_SUPPORTED_ARCH_CAP \ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { u64 data = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); data &= KVM_SUPPORTED_ARCH_CAP; } /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that * the nested hypervisor runs with NX huge pages. If it is not, * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other * L1 guests, so it need not worry about its own (L2) guests. */ data |= ARCH_CAP_PSCHANGE_MC_NO; /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that * capability to the guest too, and if EPT is disabled we're not * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will * require a nested hypervisor to do a flush of its own. */ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) data |= ARCH_CAP_RDCL_NO; if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) data |= ARCH_CAP_SSB_NO; if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; if (!boot_cpu_has_bug(X86_BUG_ITS)) data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* * If RTM=0 because the kernel has disabled TSX, the host might * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 * and therefore knows that there cannot be TAA) but keep * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, * and we want to allow migrating those guests to tsx=off hosts. */ data &= ~ARCH_CAP_TAA_NO; } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { data |= ARCH_CAP_TAA_NO; } else { /* * Nothing to do here; we emulate TSX_CTRL if present on the * host so the guest can choose between disabling TSX or * using VERW to clear CPU buffers. */ } if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) data |= ARCH_CAP_GDS_NO; return data; } static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { case MSR_IA32_ARCH_CAPABILITIES: msr->data = kvm_get_arch_capabilities(); break; case MSR_IA32_UCODE_REV: rdmsrl_safe(msr->index, &msr->data); break; default: return static_call(kvm_x86_get_msr_feature)(msr); } return 0; } static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct kvm_msr_entry msr; int r; msr.index = index; r = kvm_get_msr_feature(&msr); if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; if (kvm_msr_ignored_check(index, 0, false)) r = 0; } if (r) return r; *data = msr.data; return 0; } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) return false; if (efer & (EFER_LME | EFER_LMA) && !guest_cpuid_has(vcpu, X86_FEATURE_LM)) return false; if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) return false; return true; } bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) return false; return __kvm_valid_efer(vcpu, efer); } EXPORT_SYMBOL_GPL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; int r; if (efer & efer_reserved_bits) return 1; if (!msr_info->host_initiated) { if (!__kvm_valid_efer(vcpu, efer)) return 1; if (is_paging(vcpu) && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) return 1; } efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; r = static_call(kvm_x86_set_efer)(vcpu, efer); if (r) { WARN_ON(r > 0); return r; } if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) kvm_mmu_reset_context(vcpu); return 0; } void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { struct kvm_x86_msr_filter *msr_filter; struct msr_bitmap_range *ranges; struct kvm *kvm = vcpu->kvm; bool allowed; int idx; u32 i; /* x2APIC MSRs do not support filtering. */ if (index >= 0x800 && index <= 0x8ff) return true; idx = srcu_read_lock(&kvm->srcu); msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); if (!msr_filter) { allowed = true; goto out; } allowed = msr_filter->default_allow; ranges = msr_filter->ranges; for (i = 0; i < msr_filter->count; i++) { u32 start = ranges[i].base; u32 end = start + ranges[i].nmsrs; u32 flags = ranges[i].flags; unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { allowed = !!test_bit(index - start, bitmap); break; } } out: srcu_read_unlock(&kvm->srcu, idx); return allowed; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated) { struct msr_data msr; if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) return KVM_MSR_RET_FILTERED; switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: if (is_noncanonical_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: case MSR_IA32_SYSENTER_ESP: /* * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if * non-canonical address is written on Intel but not on * AMD (which ignores the top 32-bits, because it does * not implement 64-bit SYSENTER). * * 64-bit code should hence be able to write a non-canonical * value on AMD. Making the address canonical ensures that * vmentry does not fail on Intel after writing a non-canonical * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ data = __canonical_address(data, vcpu_virt_addr_bits(vcpu)); break; case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) return 1; if (!host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) return 1; /* * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has * incomplete and conflicting architectural behavior. Current * AMD CPUs completely ignore bits 63:32, i.e. they aren't * reserved and always read as zeros. Enforce Intel's reserved * bits check if and only if the guest CPU is Intel, and clear * the bits in all other cases. This ensures cross-vendor * migration will provide consistent behavior for the guest. */ if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) return 1; data = (u32)data; break; } msr.data = data; msr.index = index; msr.host_initiated = host_initiated; return static_call(kvm_x86_set_msr)(vcpu, &msr); } static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated) { int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) if (kvm_msr_ignored_check(index, data, true)) ret = 0; return ret; } /* * Read the MSR specified by @index into @data. Select MSR specific fault * checks are bypassed if @host_initiated is %true. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { struct msr_data msr; int ret; if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) return KVM_MSR_RET_FILTERED; switch (index) { case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) return 1; if (!host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) return 1; break; } msr.index = index; msr.host_initiated = host_initiated; ret = static_call(kvm_x86_get_msr)(vcpu, &msr); if (!ret) *data = msr.data; return ret; } static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { int ret = __kvm_get_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; if (kvm_msr_ignored_check(index, 0, false)) ret = 0; } return ret; } int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { return kvm_get_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_get_msr); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) { return kvm_set_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_set_msr); static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) { int err = vcpu->run->msr.error; if (!err) { kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); } return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); } static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) { return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); } static u64 kvm_msr_reason(int r) { switch (r) { case KVM_MSR_RET_INVALID: return KVM_MSR_EXIT_REASON_UNKNOWN; case KVM_MSR_RET_FILTERED: return KVM_MSR_EXIT_REASON_FILTER; default: return KVM_MSR_EXIT_REASON_INVAL; } } static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u32 exit_reason, u64 data, int (*completion)(struct kvm_vcpu *vcpu), int r) { u64 msr_reason = kvm_msr_reason(r); /* Check if the user wanted to know about this MSR fault */ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) return 0; vcpu->run->exit_reason = exit_reason; vcpu->run->msr.error = 0; memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); vcpu->run->msr.reason = msr_reason; vcpu->run->msr.index = index; vcpu->run->msr.data = data; vcpu->arch.complete_userspace_io = completion; return 1; } static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) { return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, complete_emulated_rdmsr, r); } static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) { return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, complete_emulated_wrmsr, r); } int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; int r; r = kvm_get_msr(vcpu, ecx, &data); /* MSR read failed? See if we should ask user space */ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { /* Bounce to user space */ return 0; } if (!r) { trace_kvm_msr_read(ecx, data); kvm_rax_write(vcpu, data & -1u); kvm_rdx_write(vcpu, (data >> 32) & -1u); } else { trace_kvm_msr_read_ex(ecx); } return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); int r; r = kvm_set_msr(vcpu, ecx, data); /* MSR write failed? See if we should ask user space */ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) /* Bounce to user space */ return 0; /* Signal all other negative errors to userspace */ if (r < 0) return r; if (!r) trace_kvm_msr_write(ecx, data); else trace_kvm_msr_write_ex(ecx, data); return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); int kvm_emulate_invd(struct kvm_vcpu *vcpu) { /* Treat an INVD instruction as a NOP and just skip it. */ return kvm_emulate_as_nop(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_invd); int kvm_emulate_mwait(struct kvm_vcpu *vcpu) { pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); return kvm_emulate_as_nop(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_mwait); int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); int kvm_emulate_monitor(struct kvm_vcpu *vcpu) { pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); return kvm_emulate_as_nop(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { xfer_to_guest_mode_prepare(); return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } /* * The fast path for frequent and performance sensitive wrmsr emulation, * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces * the latency of virtual IPI by avoiding the expensive bits of transitioning * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the * other cases which must be called after interrupts are enabled on the host. */ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) { if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) return 1; if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && ((u32)(data >> 32) != X2APIC_BROADCAST)) { data &= ~(1 << 12); kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); trace_kvm_apic_write(APIC_ICR, (u32)data); return 0; } return 1; } static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) { if (!kvm_can_use_hv_timer(vcpu)) return 1; kvm_set_lapic_tscdeadline_msr(vcpu, data); return 0; } fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; fastpath_t ret = EXIT_FASTPATH_NONE; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); ret = EXIT_FASTPATH_EXIT_HANDLED; } break; case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_tscdeadline(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); ret = EXIT_FASTPATH_REENTER_GUEST; } break; default: break; } if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); /* * Adapt set_msr() to msr_io()'s calling convention */ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { return kvm_get_msr_ignored_check(vcpu, index, data, true); } static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { return kvm_set_msr_ignored_check(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 struct pvclock_clock { int vclock_mode; u64 cycle_last; u64 mask; u32 mult; u32 shift; u64 base_cycles; u64 offset; }; struct pvclock_gtod_data { seqcount_t seq; struct pvclock_clock clock; /* extract of a clocksource struct */ struct pvclock_clock raw_clock; /* extract of a clocksource struct */ ktime_t offs_boot; u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata->clock.cycle_last = tk->tkr_mono.cycle_last; vdata->clock.mask = tk->tkr_mono.mask; vdata->clock.mult = tk->tkr_mono.mult; vdata->clock.shift = tk->tkr_mono.shift; vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; vdata->clock.offset = tk->tkr_mono.base; vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; vdata->raw_clock.mask = tk->tkr_raw.mask; vdata->raw_clock.mult = tk->tkr_raw.mult; vdata->raw_clock.shift = tk->tkr_raw.shift; vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; vdata->raw_clock.offset = tk->tkr_raw.base; vdata->wall_time_sec = tk->xtime_sec; vdata->offs_boot = tk->offs_boot; write_seqcount_end(&vdata->seq); } static s64 get_kvmclock_base_ns(void) { /* Count up from boot time, but with the frequency of the raw clock. */ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); } #else static s64 get_kvmclock_base_ns(void) { /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ return ktime_get_boottime_ns(); } #endif void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) { int version; int r; struct pvclock_wall_clock wc; u32 wc_sec_hi; u64 wall_nsec; if (!wall_clock) return; r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); if (r) return; if (version & 1) ++version; /* first time write, random junk */ ++version; if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) return; /* * The guest calculates current wall clock time by adding * system time (updated by kvm_guest_time_update below) to the * wall clock specified here. We do the reverse here. */ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); wc.nsec = do_div(wall_nsec, 1000000000); wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); if (sec_hi_ofs) { wc_sec_hi = wall_nsec >> 32; kvm_write_guest(kvm, wall_clock + sec_hi_ofs, &wc_sec_hi, sizeof(wc_sec_hi)); } version++; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, bool old_msr, bool host_initiated) { struct kvm_arch *ka = &vcpu->kvm->arch; if (vcpu->vcpu_id == 0 && !host_initiated) { if (ka->boot_vcpu_runs_old_kvmclock != old_msr) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = old_msr; } vcpu->arch.time = system_time; kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ vcpu->arch.pv_time_enabled = false; if (!(system_time & 1)) return; if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, system_time & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = true; return; } static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { do_shl32_div32(dividend, divisor); return dividend; } static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, s8 *pshift, u32 *pmultiplier) { uint64_t scaled64; int32_t shift = 0; uint64_t tps64; uint32_t tps32; tps64 = base_hz; scaled64 = scaled_hz; while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) scaled64 >>= 1; else tps32 <<= 1; shift++; } *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); } #ifdef CONFIG_X86_64 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); #endif static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); static unsigned long max_tsc_khz; static u32 adjust_tsc_khz(u32 khz, s32 ppm) { u64 v = (u64)khz * (1000000 + ppm); do_div(v, 1000000); return v; } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { u64 ratio; /* Guest TSC same frequency as host TSC? */ if (!scale) { kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? */ if (!kvm_has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; return 0; } else { pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); return -1; } } /* TSC scaling required - calculate ratio */ ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; } kvm_vcpu_write_tsc_multiplier(vcpu, ratio); return 0; } static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return -1; } /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, &vcpu->arch.virtual_tsc_shift, &vcpu->arch.virtual_tsc_mult); vcpu->arch.virtual_tsc_khz = user_tsc_khz; /* * Compute the variation in TSC rate which is acceptable * within the range of tolerance and decide if the * rate being applied is within that bounds of the hardware * rate. If so, no scaling or compensation need be done. */ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, vcpu->arch.virtual_tsc_mult, vcpu->arch.virtual_tsc_shift); tsc += vcpu->arch.this_tsc_write; return tsc; } static inline int gtod_is_based_on_tsc(int mode) { return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 bool vcpus_matched; struct kvm_arch *ka = &vcpu->kvm->arch; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&vcpu->kvm->online_vcpus)); /* * Once the masterclock is enabled, always perform request in * order to update it. * * In order to enable masterclock, the host clocksource must be TSC * and the vcpus need to have matched TSCs. When that happens, * perform request to enable masterclock. */ if (ka->use_master_clock || (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, atomic_read(&vcpu->kvm->online_vcpus), ka->use_master_clock, gtod->clock.vclock_mode); #endif } /* * Multiply tsc by a fixed point number represented by ratio. * * The most significant 64-N bits (mult) of ratio represent the * integral part of the fixed point number; the remaining N bits * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * * N equals to kvm_tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); } u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) { u64 _tsc = tsc; if (ratio != kvm_default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; } EXPORT_SYMBOL_GPL(kvm_scale_tsc); static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; if (l2_multiplier == kvm_default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, kvm_tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; } EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { if (l2_multiplier != kvm_default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, kvm_tsc_scaling_ratio_frac_bits); return l1_multiplier; } EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset, l1_offset); vcpu->arch.l1_tsc_offset = l1_offset; /* * If we are here because L1 chose not to trap WRMSR to TSC then * according to the spec this should set L1's TSC (as opposed to * setting L1's offset for L2). */ if (is_guest_mode(vcpu)) vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( l1_offset, static_call(kvm_x86_get_l2_tsc_offset)(vcpu), static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_offset = l1_offset; static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) { vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; /* Userspace is changing the multiplier while L2 is active */ if (is_guest_mode(vcpu)) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( l1_multiplier, static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_scaling_ratio = l1_multiplier; if (kvm_has_tsc_control) static_call(kvm_x86_write_tsc_multiplier)( vcpu, vcpu->arch.tsc_scaling_ratio); } static inline bool kvm_check_tsc_unstable(void) { #ifdef CONFIG_X86_64 /* * TSC is marked unstable when we're running on Hyper-V, * 'TSC page' clocksource is good. */ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) return false; #endif return check_tsc_unstable(); } static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; bool matched; bool already_matched; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_l1_tsc_offset(vcpu, data); ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { if (data == 0) { /* * detection of vcpu initialization -- need to sync * with other vCPUs. This particularly helps to keep * kvm_clock stable after CPU hotplug */ synchronizing = true; } else { u64 tsc_exp = kvm->arch.last_tsc_write + nsec_to_cycles(vcpu, elapsed); u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; /* * Special case: TSC write with a small delta (1 second) * of virtual cycle time against real time is * interpreted as an attempt to synchronize the CPU. */ synchronizing = data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp; } } /* * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. */ if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); } else { /* * We split periods of matched TSC writes into generations. * For each generation, we track the original measured * nanosecond time, offset, and write, so if TSCs are in * sync, we can match exact offset, and if not, we can match * exact software computation in compute_guest_tsc() * * These values are tracked in kvm->arch.cur_xxx variables. */ kvm->arch.cur_tsc_generation++; kvm->arch.cur_tsc_nsec = ns; kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; } /* * We also track th most recent recorded KHZ, write and time to * allow the matching interval to be extended at each write. */ kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; vcpu->arch.last_guest_tsc = data; /* Keep track of which generation this VCPU has synchronized to */ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); raw_spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { kvm->arch.nr_vcpus_matched_tsc++; } kvm_track_tsc_matching(vcpu); raw_spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { u64 tsc_offset = vcpu->arch.l1_tsc_offset; kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } #ifdef CONFIG_X86_64 static u64 read_tsc(void) { u64 ret = (u64)rdtsc_ordered(); u64 last = pvclock_gtod_data.clock.cycle_last; if (likely(ret >= last)) return ret; /* * GCC likes to generate cmov here, but this branch is extremely * predictable (it's just a function of time and the likely is * very likely) and there's a data dependence, so force GCC * to generate a branch instead. I don't barrier() because * we don't actually need a barrier, and if this function * ever gets inlined it will generate worse code. */ asm volatile (""); return last; } static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, int *mode) { long v; u64 tsc_pg_val; switch (clock->vclock_mode) { case VDSO_CLOCKMODE_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ *mode = VDSO_CLOCKMODE_HVCLOCK; v = (tsc_pg_val - clock->cycle_last) & clock->mask; } else { /* TSC page invalid */ *mode = VDSO_CLOCKMODE_NONE; } break; case VDSO_CLOCKMODE_TSC: *mode = VDSO_CLOCKMODE_TSC; *tsc_timestamp = read_tsc(); v = (*tsc_timestamp - clock->cycle_last) & clock->mask; break; default: *mode = VDSO_CLOCKMODE_NONE; } if (*mode == VDSO_CLOCKMODE_NONE) *tsc_timestamp = v = 0; return v * clock->mult; } static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; int mode; u64 ns; do { seq = read_seqcount_begin(&gtod->seq); ns = gtod->raw_clock.base_cycles; ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode); ns >>= gtod->raw_clock.shift; ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); *t = ns; return mode; } static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; int mode; u64 ns; do { seq = read_seqcount_begin(&gtod->seq); ts->tv_sec = gtod->wall_time_sec; ns = gtod->clock.base_cycles; ns += vgettsc(&gtod->clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); ts->tv_nsec = ns; return mode; } /* returns true if host is using TSC based clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, tsc_timestamp)); } /* returns true if host is using TSC based clocksource */ static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, u64 *tsc_timestamp) { /* checked again under seqlock below */ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); } #endif /* * * Assuming a stable TSC across physical CPUS, and a stable TSC * across virtual CPUs, the following condition is possible. * Each numbered line represents an event visible to both * CPUs at the next numbered event. * * "timespecX" represents host monotonic time. "tscX" represents * RDTSC value. * * VCPU0 on CPU0 | VCPU1 on CPU1 * * 1. read timespec0,tsc0 * 2. | timespec1 = timespec0 + N * | tsc1 = tsc0 + M * 3. transition to guest | transition to guest * 4. ret0 = timespec0 + (rdtsc - tsc0) | * 5. | ret1 = timespec1 + (rdtsc - tsc1) * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) * * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: * * - ret0 < ret1 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) * ... * - 0 < N - M => M < N * * That is, when timespec0 != timespec1, M < N. Unfortunately that is not * always the case (the difference between two distinct xtime instances * might be smaller then the difference between corresponding TSC reads, * when updating guest vcpus pvclock areas). * * To avoid that problem, do not allow visibility of distinct * system_timestamp/tsc_timestamp values simultaneously: use a master * copy of host monotonic time values. Update that master copy * in lockstep. * * Rely on synchronization of host TSCs and guest TSCs for monotonicity. * */ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) { #ifdef CONFIG_X86_64 struct kvm_arch *ka = &kvm->arch; int vclock_mode; bool host_tsc_clocksource, vcpus_matched; vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus)); /* * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ host_tsc_clocksource = kvm_get_time_and_clockread( &ka->master_kernel_ns, &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched && !ka->backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); vclock_mode = pvclock_gtod_data.clock.vclock_mode; trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, vcpus_matched); #endif } void kvm_make_mclock_inprogress_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } static void kvm_gen_update_masterclock(struct kvm *kvm) { #ifdef CONFIG_X86_64 int i; struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; unsigned long flags; kvm_hv_invalidate_tsc_page(kvm); kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); #endif } u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; unsigned long flags; u64 ret; raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); if (__this_cpu_read(cpu_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else ret = get_kvmclock_base_ns() + ka->kvmclock_offset; put_cpu(); return ret; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v, struct gfn_to_hva_cache *cache, unsigned int offset) { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, &guest_hv_clock, offset, sizeof(guest_hv_clock)))) return; /* This VCPU is paused, but it's legal for a guest to read another * VCPU's kvmclock, so we really have to follow the specification where * it says that version is odd if data is being modified, and even after * it is consistent. * * Version field updates must be kept separate. This is because * kvm_write_guest_cached might use a "rep movs" instruction, and * writes within a string instruction are weakly ordered. So there * are three writes overall. * * As a small optimization, only write the version field in the first * and third write. The vcpu->pv_time cache is still valid, because the * version field is the first in the struct. */ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); if (guest_hv_clock.version & 1) ++guest_hv_clock.version; /* first time write, random junk */ vcpu->hv_clock.version = guest_hv_clock.version + 1; kvm_write_guest_offset_cached(v->kvm, cache, &vcpu->hv_clock, offset, sizeof(vcpu->hv_clock.version)); smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); if (vcpu->pvclock_set_guest_stopped_request) { vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); kvm_write_guest_offset_cached(v->kvm, cache, &vcpu->hv_clock, offset, sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; kvm_write_guest_offset_cached(v->kvm, cache, &vcpu->hv_clock, offset, sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; u8 pvclock_flags; bool use_master_clock; kernel_ns = 0; host_tsc = 0; /* * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ raw_spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } raw_spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } if (!use_master_clock) { host_tsc = rdtsc(); kernel_ns = get_kvmclock_base_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); /* * We may have to catch up the TSC to match elapsed wall clock * time for two reasons, even if kvmclock is used. * 1) CPU could have been running below the maximum TSC rate * 2) Broken TSC compensation resets the base at each VCPU * entry to avoid unknown leaps of TSC even when running * again on the same CPU. This may cause apparent elapsed * time to disappear, and the guest to stand still or run * very slowly. */ if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } local_irq_restore(flags); /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; } vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; vcpu->hv_clock.flags = pvclock_flags; if (vcpu->pv_time_enabled) kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); if (vcpu->xen.vcpu_info_set) kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_set) kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); if (!v->vcpu_idx) kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } /* * kvmclock updates which are isolated to a given vcpu, such as * vcpu->cpu migration, should not allow system_timestamp from * the rest of the vcpus to remain static. Otherwise ntp frequency * correction applies to one vcpu's system_timestamp but not * the others. * * So in those cases, request a kvmclock update for all vcpus. * We need to rate-limit these requests though, as they can * considerably slow guests that have a large number of vcpus. * The time for a remote vcpu to update its kvmclock is bound * by the delay we use to rate-limit the updates. */ #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) static void kvmclock_update_fn(struct work_struct *work) { int i; struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_update_work); struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_vcpu_kick(vcpu); } } static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { struct kvm *kvm = v->kvm; kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); schedule_delayed_work(&kvm->arch.kvmclock_update_work, KVMCLOCK_UPDATE_DELAY); } #define KVMCLOCK_SYNC_PERIOD (300 * HZ) static void kvmclock_sync_fn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); if (!kvmclock_periodic_sync) return; schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; } static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; switch (msr) { case MSR_IA32_MCG_STATUS: vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: if (!(mcg_cap & MCG_CTL_P) && (data || !msr_info->host_initiated)) return 1; if (data != 0 && data != ~(u64)0) return 1; vcpu->arch.mcg_ctl = data; break; default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { u32 offset = array_index_nospec( msr - MSR_IA32_MC0_CTL, MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore * this to avoid an uncatched #GP in the guest. * * UNIXWARE clears bit 0 of MC1_CTL to ignore * correctable, single-bit ECC data errors. */ if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10) | 1) != ~(u64)0) return 1; /* MCi_STATUS */ if (!msr_info->host_initiated && (offset & 0x3) == 1 && data != 0) { if (!can_set_mci_status(vcpu)) return 1; } vcpu->arch.mce_banks[offset] = data; break; } return 1; } return 0; } static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; return (vcpu->arch.apf.msr_en_val & mask) == mask; } static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; /* Bits 4:5 are reserved, Should be zero */ if (data & 0x30) return 1; if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) return 1; if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) return 1; if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; vcpu->arch.apf.msr_en_val = data; if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); return 0; } if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u64))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); return 0; } static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) { /* Bits 8-63 are reserved */ if (data >> 8) return 1; if (!lapic_in_kernel(vcpu)) return 1; vcpu->arch.apf.msr_int_val = data; vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; return 0; } static void kvmclock_reset(struct kvm_vcpu *vcpu) { vcpu->arch.pv_time_enabled = false; vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; static_call(kvm_x86_tlb_flush_all)(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; if (!tdp_enabled) { /* * A TLB flush on behalf of the guest is equivalent to * INVPCID(all), toggling CR4.PGE, etc., which requires * a forced sync of the shadow page tables. Unload the * entire MMU here and the subsequent load will sync the * shadow page tables, and also flush the TLB. */ kvm_mmu_unload(vcpu); return; } static_call(kvm_x86_tlb_flush_guest)(vcpu); } static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; static_call(kvm_x86_tlb_flush_current)(vcpu); } /* * Service "local" TLB flush requests, which are specific to the current MMU * context. In addition to the generic event handling in vcpu_enter_guest(), * TLB flushes that are targeted at an MMU context also need to be serviced * prior before nested VM-Enter/VM-Exit. */ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) { if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); } EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); static void record_steal_time(struct kvm_vcpu *vcpu) { struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; struct kvm_steal_time __user *st; struct kvm_memslots *slots; gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; u64 steal; u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { kvm_xen_runstate_set_running(vcpu); return; } if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; slots = kvm_memslots(vcpu->kvm); if (unlikely(slots->generation != ghc->generation || gpa != ghc->gpa || kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { /* We rely on the fact that it fits in a single page. */ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || kvm_is_error_hva(ghc->hva) || !ghc->memslot) return; } st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { u8 st_preempted = 0; int err = -EFAULT; if (!user_access_begin(st, sizeof(*st))) return; asm volatile("1: xchgb %0, %2\n" "xor %1, %1\n" "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+q" (st_preempted), "+&r" (err), "+m" (st->preempted)); if (err) goto out; user_access_end(); vcpu->arch.st.preempted = 0; trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); if (!user_access_begin(st, sizeof(*st))) goto dirty; } else { if (!user_access_begin(st, sizeof(*st))) return; unsafe_put_user(0, &st->preempted, out); vcpu->arch.st.preempted = 0; } unsafe_get_user(version, &st->version, out); if (version & 1) version += 1; /* first time write, random junk */ version += 1; unsafe_put_user(version, &st->version, out); smp_wmb(); unsafe_get_user(steal, &st->steal, out); steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; unsafe_put_user(steal, &st->steal, out); version += 1; unsafe_put_user(version, &st->version, out); out: user_access_end(); dirty: mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { case MSR_AMD64_NB_CFG: case MSR_IA32_UCODE_WRITE: case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: case MSR_AMD64_DC_CFG: case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: break; case MSR_IA32_UCODE_REV: if (msr_info->host_initiated) vcpu->arch.microcode_version = data; break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated) return 1; vcpu->arch.arch_capabilities = data; break; case MSR_IA32_PERF_CAPABILITIES: { struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; if (!msr_info->host_initiated) return 1; if (kvm_get_msr_feature(&msr_ent)) return 1; if (data & ~msr_ent.data) return 1; vcpu->arch.perf_capabilities = data; return 0; } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ /* Handle McStatusWrEn */ if (data == BIT_ULL(18)) { vcpu->arch.msr_hwcr = data; } else if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " "0x%llx\n", data); return 1; } break; case 0x200 ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); /* Before back to guest, tsc_timestamp must be adjusted * as well, otherwise guest's percpu pvclock time could jump. */ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } break; case MSR_IA32_MISC_ENABLE: if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; kvm_update_cpuid_runtime(vcpu); } else { vcpu->arch.ia32_misc_enable_msr = data; } break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; case MSR_IA32_POWER_CTL: vcpu->arch.msr_ia32_power_ctl = data; break; case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, data); } else { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } break; case MSR_IA32_XSS: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* * KVM supports exposing PT to the guest, but does not support * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) return 1; if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) return 1; if (unlikely(!sched_info_on())) return 1; if (data & KVM_STEAL_RESERVED_MASK) return 1; vcpu->arch.st.msr_val = data; if (!(data & KVM_MSR_ENABLED)) break; kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) return 1; /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; vcpu->arch.msr_kvm_poll_control = data; break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: pr = true; fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; case MSR_K7_CLK_CTL: /* * Ignore all writes to this no longer documented MSR. * Writes are only relevant for old K7 processors, * all pre-dating SVM, but a recommended workaround from * AMD for these chips. It is possible to specify the * affected processor models on the command line, hence * the need to ignore the workaround. */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ if (report_ignored_msrs) vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.length = data; break; case MSR_AMD64_OSVW_STATUS: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.status = data; break; case MSR_PLATFORM_INFO: if (!msr_info->host_initiated || (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && cpuid_fault_enabled(vcpu))) return 1; vcpu->arch.msr_platform_info = data; break; case MSR_MISC_FEATURES_ENABLES: if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && !supports_cpuid_fault(vcpu))) return 1; vcpu->arch.msr_misc_features_enables = data; break; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } return 0; } EXPORT_SYMBOL_GPL(kvm_set_msr_common); static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; switch (msr) { case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: data = 0; break; case MSR_IA32_MCG_CAP: data = vcpu->arch.mcg_cap; break; case MSR_IA32_MCG_CTL: if (!(mcg_cap & MCG_CTL_P) && !host) return 1; data = vcpu->arch.mcg_ctl; break; case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { u32 offset = array_index_nospec( msr - MSR_IA32_MC0_CTL, MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); data = vcpu->arch.mce_banks[offset]; break; } return 1; } *pdata = data; return 0; } int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_LASTBRANCHFROMIP: case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: case MSR_AMD64_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: case MSR_VM_HSAVE_PA: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: /* * Intel Sandy Bridge CPUs must support the RAPL (running average power * limit) MSRs. Just return 0, as we do not want to expose the host * data here. Do not conditionalize this on CPUID, as KVM does not do * so for existing CPU-specific MSRs. */ case MSR_RAPL_POWER_UNIT: case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ case MSR_PKG_ENERGY_STATUS: /* Total package */ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); if (!msr_info->host_initiated) return 1; msr_info->data = 0; break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; case MSR_IA32_UCODE_REV: msr_info->data = vcpu->arch.microcode_version; break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; case MSR_IA32_PERF_CAPABILITIES: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) return 1; msr_info->data = vcpu->arch.perf_capabilities; break; case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; case MSR_IA32_TSC: { /* * Intel SDM states that MSR_IA32_TSC read adds the TSC offset * even when not intercepted. AMD manual doesn't explicitly * state this but appears to behave the same. * * On userspace reads and writes, however, we unconditionally * return L1's TSC value to ensure backwards-compatible * behavior for migration. */ u64 offset, ratio; if (msr_info->host_initiated) { offset = vcpu->arch.l1_tsc_offset; ratio = vcpu->arch.l1_tsc_scaling_ratio; } else { offset = vcpu->arch.tsc_offset; ratio = vcpu->arch.tsc_scaling_ratio; } msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; break; } case MSR_MTRRcap: case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; break; /* * MSR_EBC_FREQUENCY_ID * Conservative value valid for even the basic CPU models. * Models 0,1: 000 in bits 23:21 indicating a bus speed of * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, * and 266MHz for model 3, or 4. Set Core Clock * Frequency to System Bus Frequency Ratio to 1 (bits * 31:24) even though these are only valid for CPU * models > 2, however guests may end up dividing or * multiplying by zero otherwise. */ case MSR_EBC_FREQUENCY_ID: msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_TSC_DEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; case MSR_IA32_MISC_ENABLE: msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; msr_info->data = vcpu->arch.smbase; break; case MSR_SMI_COUNT: msr_info->data = vcpu->arch.smi_count; break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ msr_info->data = 1000ULL; /* CPU multiplier */ msr_info->data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; msr_info->data = vcpu->arch.time; break; case MSR_KVM_SYSTEM_TIME_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) return 1; msr_info->data = vcpu->arch.apf.msr_en_val; break; case MSR_KVM_ASYNC_PF_INT: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) return 1; msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_KVM_POLL_CONTROL: if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) return 1; msr_info->data = vcpu->arch.msr_kvm_poll_control; break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; msr_info->data = vcpu->arch.ia32_xss; break; case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other * are set to zero, indicating minimum divisors for * every field. * * This prevents guest kernels on AMD host with CPU * type 6, model 8 and higher from exploding due to * the rdmsr failing. */ msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow * scenarios where it sets bit #19, itself documented as * a "reserved" bit. Best effort attempt to source coherent * read data here should the balance of the register be * interpreted by the guest: * * L2 cache control register 3: 64GB range, 256KB size, * enabled, latency 0x1, configured */ msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.status; break; case MSR_PLATFORM_INFO: if (!msr_info->host_initiated && !vcpu->kvm->arch.guest_can_read_msr_platform_info) return 1; msr_info->data = vcpu->arch.msr_platform_info; break; case MSR_MISC_FEATURES_ENABLES: msr_info->data = vcpu->arch.msr_misc_features_enables; break; case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); return KVM_MSR_RET_INVALID; } return 0; } EXPORT_SYMBOL_GPL(kvm_get_msr_common); /* * Read or write a bunch of msrs. All parameters are kernel addresses. * * @return number of msrs set successfully. */ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, struct kvm_msr_entry *entries, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { int i; for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; return i; } /* * Read or write a bunch of msrs. Parameters are user addresses. * * @return number of msrs set successfully. */ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data), int writeback) { struct kvm_msrs msrs; struct kvm_msr_entry *entries; int r, n; unsigned size; r = -EFAULT; if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) goto out; r = -E2BIG; if (msrs.nmsrs >= MAX_IO_MSRS) goto out; size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; entries = memdup_user(user_msrs->entries, size); if (IS_ERR(entries)) { r = PTR_ERR(entries); goto out; } r = n = __msr_io(vcpu, &msrs, entries, do_msr); if (r < 0) goto out_free; r = -EFAULT; if (writeback && copy_to_user(user_msrs->entries, entries, size)) goto out_free; r = n; out_free: kfree(entries); out: return r; } static inline bool kvm_can_mwait_in_guest(void) { return boot_cpu_has(X86_FEATURE_MWAIT) && !boot_cpu_has_bug(X86_BUG_MONITOR) && boot_cpu_has(X86_FEATURE_ARAT); } static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 __user *cpuid_arg) { struct kvm_cpuid2 cpuid; int r; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) return r; r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries); if (r) return r; r = -EFAULT; if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) return r; return 0; } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; switch (ext) { case KVM_CAP_IRQCHIP: case KVM_CAP_HLT: case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_HYPERV_SYNIC2: case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: case KVM_CAP_HYPERV_CPUID: case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_PMU_EVENT_FILTER: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: case KVM_CAP_X86_MSR_FILTER: case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: #ifdef CONFIG_X86_SGX_KVM case KVM_CAP_SGX_ATTRIBUTE: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: r = KVM_EXIT_HYPERCALL_VALID_MASK; break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; #ifdef CONFIG_KVM_XEN case KVM_CAP_XEN_HVM: r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE; break; #endif case KVM_CAP_SYNC_REGS: r = KVM_SYNC_X86_VALID_FIELDS; break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: r = KVM_X86_DISABLE_EXITS_PAUSE; if (!mitigate_smt_rsb) { r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_CSTATE; if (kvm_can_mwait_in_guest()) r |= KVM_X86_DISABLE_EXITS_MWAIT; } break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, * so do not report SMM to be available if real mode is * emulated via vm86 mode. Still, do not go to great lengths * to avoid userspace's usage of the feature, because it is a * fringe case that is not enabled except via specific settings * of the module parameters. */ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: r = KVM_MAX_VCPU_ID; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; case KVM_CAP_XCRS: r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: r = kvm_x86_ops.nested_ops->get_state ? kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; case KVM_CAP_STEAL_TIME: r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: if (kvm_has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else r = 0; break; default: break; } return r; } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; long r; switch (ioctl) { case KVM_GET_MSR_INDEX_LIST: { struct kvm_msr_list __user *user_msr_list = argp; struct kvm_msr_list msr_list; unsigned n; r = -EFAULT; if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) goto out; n = msr_list.nmsrs; msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) goto out; r = -E2BIG; if (n < msr_list.nmsrs) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msrs_to_save, num_msrs_to_save * sizeof(u32))) goto out; if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, num_emulated_msrs * sizeof(u32))) goto out; r = 0; break; } case KVM_GET_SUPPORTED_CPUID: case KVM_GET_EMULATED_CPUID: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, ioctl); if (r) goto out; r = -EFAULT; if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) goto out; r = 0; break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; if (copy_to_user(argp, &kvm_mce_cap_supported, sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; case KVM_GET_MSR_FEATURE_INDEX_LIST: { struct kvm_msr_list __user *user_msr_list = argp; struct kvm_msr_list msr_list; unsigned int n; r = -EFAULT; if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) goto out; n = msr_list.nmsrs; msr_list.nmsrs = num_msr_based_features; if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) goto out; r = -E2BIG; if (n < msr_list.nmsrs) goto out; r = -EFAULT; if (copy_to_user(user_msr_list->indices, &msr_based_features, num_msr_based_features * sizeof(u32))) goto out; r = 0; break; } case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); break; default: r = -EINVAL; break; } out: return r; } static void wbinvd_ipi(void *garbage) { wbinvd(); } static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { return kvm_arch_has_noncoherent_dma(vcpu->kvm); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { if (static_call(kvm_x86_has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } static_call(kvm_x86_vcpu_load)(vcpu, cpu); /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); vcpu->arch.tsc_offset_adjustment = 0; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (kvm_check_tsc_unstable()) { u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu)) kvm_lapic_restart_hv_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration */ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; } kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; struct kvm_steal_time __user *st; struct kvm_memslots *slots; static const u8 preempted = KVM_VCPU_PREEMPTED; gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; /* * The vCPU can be marked preempted if and only if the VM-Exit was on * an instruction boundary and will not trigger guest emulation of any * kind (see vcpu_run). Vendor specific code controls (conservatively) * when this is true, for example allowing the vCPU to be marked * preempted if and only if the VM-Exit was due to a host interrupt. */ if (!vcpu->arch.at_instruction_boundary) { vcpu->stat.preemption_other++; return; } vcpu->stat.preemption_reported++; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (vcpu->arch.st.preempted) return; /* This happens on process exit */ if (unlikely(current->mm != vcpu->kvm->mm)) return; slots = kvm_memslots(vcpu->kvm); if (unlikely(slots->generation != ghc->generation || gpa != ghc->gpa || kvm_is_error_hva(ghc->hva) || !ghc->memslot)) return; st = (struct kvm_steal_time __user *)ghc->hva; BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; if (vcpu->preempted) { if (!vcpu->arch.guest_state_protected) vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); /* * Take the srcu lock as memslots will be accessed to check the gfn * cache generation against the memslots generation. */ idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_xen_msr_enabled(vcpu->kvm)) kvm_xen_runstate_set_preempted(vcpu); else kvm_steal_time_set_preempted(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); } static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { int r; r = kvm_apic_set_state(vcpu, s); if (r) return r; update_cr8_intercept(vcpu); return 0; } static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) { /* * We can accept userspace's request for interrupt injection * as long as we have a place to store the interrupt number. * The actual injection will happen when the CPU is able to * deliver the interrupt. */ if (kvm_cpu_has_extint(vcpu)) return false; /* Acknowledging ExtINT does not happen if LINT0 is masked. */ return (!lapic_in_kernel(vcpu) || kvm_apic_accept_pic_intr(vcpu)); } static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) { /* * Do not cause an interrupt window exit if an exception * is pending or an event needs reinjection; userspace * might want to inject the interrupt manually using KVM_SET_REGS * or KVM_SET_SREGS. For that to work, we must be at an * instruction boundary and with no events half-injected. */ return (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_accept_dm_intr(vcpu) && !kvm_event_needs_reinjection(vcpu) && !vcpu->arch.exception.pending); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; if (!irqchip_in_kernel(vcpu->kvm)) { kvm_queue_interrupt(vcpu, irq->irq, false); kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } /* * With in-kernel LAPIC, we only use this to inject EXTINT, so * fail for in-kernel 8259. */ if (pic_in_kernel(vcpu->kvm)) return -ENXIO; if (vcpu->arch.pending_external_vector != -1) return -EEXIST; vcpu->arch.pending_external_vector = irq->irq; kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { kvm_inject_nmi(vcpu); return 0; } static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) { kvm_make_request(KVM_REQ_SMI, vcpu); return 0; } static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { if (tac->flags) return -EINVAL; vcpu->arch.tpr_access_reporting = !!tac->enabled; return 0; } static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, u64 mcg_cap) { int r; unsigned bank_num = mcg_cap & 0xff, bank; r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; static_call(kvm_x86_setup_mce)(vcpu); out: return r; } static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; u64 *banks = vcpu->arch.mce_banks; if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled */ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank */ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } if (banks[1] & MCI_STATUS_VAL) mce->status |= MCI_STATUS_OVER; banks[2] = mce->addr; banks[3] = mce->misc; vcpu->arch.mcg_status = mce->mcg_status; banks[1] = mce->status; kvm_queue_exception(vcpu, MC_VECTOR); } else if (!(banks[1] & MCI_STATUS_VAL) || !(banks[1] & MCI_STATUS_UC)) { if (banks[1] & MCI_STATUS_VAL) mce->status |= MCI_STATUS_OVER; banks[2] = mce->addr; banks[3] = mce->misc; banks[1] = mce->status; } else banks[1] |= MCI_STATUS_OVER; return 0; } static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { process_nmi(vcpu); if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); /* * In guest mode, payload delivery should be deferred, * so that the L1 hypervisor can intercept #PF before * CR2 is modified (or intercept #DB before DR6 is * modified under nVMX). Unless the per-VM capability, * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we * opportunistically defer the exception payload, deliver it if the * capability hasn't been requested before processing a * KVM_GET_VCPU_EVENTS. */ if (!vcpu->kvm->arch.exception_payload_enabled && vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) kvm_deliver_exception_payload(vcpu); /* * The API doesn't provide the instruction length for software * exceptions, so don't report them. As long as the guest RIP * isn't advanced, we should expect to encounter the exception * again. */ if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { events->exception.injected = 0; events->exception.pending = 0; } else { events->exception.injected = vcpu->arch.exception.injected; events->exception.pending = vcpu->arch.exception.pending; /* * For ABI compatibility, deliberately conflate * pending and injected exceptions when * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. */ if (!vcpu->kvm->arch.exception_payload_enabled) events->exception.injected |= vcpu->arch.exception.pending; } events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; events->exception.error_code = vcpu->arch.exception.error_code; events->exception_has_payload = vcpu->arch.exception.has_payload; events->exception_payload = vcpu->arch.exception.payload; events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ events->smi.smm = is_smm(vcpu); events->smi.pending = vcpu->arch.smi_pending; events->smi.smm_inside_nmi = !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); events->smi.latched_init = kvm_lapic_latched_init(vcpu); events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; memset(&events->reserved, 0, sizeof(events->reserved)); } static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM | KVM_VCPUEVENT_VALID_PAYLOAD)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { if (!vcpu->kvm->arch.exception_payload_enabled) return -EINVAL; if (events->exception.pending) events->exception.injected = 0; else events->exception_has_payload = 0; } else { events->exception.pending = 0; events->exception_has_payload = 0; } if ((events->exception.injected || events->exception.pending) && (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; /* INITs are latched while in SMM */ if (events->flags & KVM_VCPUEVENT_VALID_SMM && (events->smi.smm || events->smi.pending) && vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) return -EINVAL; process_nmi(vcpu); vcpu->arch.exception.injected = events->exception.injected; vcpu->arch.exception.pending = events->exception.pending; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; vcpu->arch.exception.has_payload = events->exception_has_payload; vcpu->arch.exception.payload = events->exception_payload; vcpu->arch.interrupt.injected = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) static_call(kvm_x86_set_interrupt_shadow)(vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { kvm_leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); } vcpu->arch.smi_pending = events->smi.pending; if (events->smi.smm) { if (events->smi.smm_inside_nmi) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; } if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); else clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } } kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { unsigned long val; memset(dbgregs, 0, sizeof(*dbgregs)); memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); kvm_get_dr(vcpu, 6, &val); dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { if (dbgregs->flags) return -EINVAL; if (!kvm_dr6_valid(dbgregs->dr6)) return -EINVAL; if (!kvm_dr7_valid(dbgregs->dr7)) return -EINVAL; memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; kvm_update_dr7(vcpu); return 0; } #define XSTATE_COMPACTION_ENABLED (1ULL << 63) static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; u64 xstate_bv = xsave->header.xfeatures; u64 valid; /* * Copy legacy XSAVE area, to avoid complications with CPUID * leaves 0 and 1 in the loop below. */ memcpy(dest, xsave, XSAVE_HDR_OFFSET); /* Set XSTATE_BV */ xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; /* * Copy each region from the possibly compacted offset to the * non-compacted offset. */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u32 size, offset, ecx, edx; u64 xfeature_mask = valid & -valid; int xfeature_nr = fls64(xfeature_mask) - 1; void *src; cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); if (xfeature_nr == XFEATURE_PKRU) { memcpy(dest + offset, &vcpu->arch.pkru, sizeof(vcpu->arch.pkru)); } else { src = get_xsave_addr(xsave, xfeature_nr); if (src) memcpy(dest + offset, src, size); } valid -= xfeature_mask; } } static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; /* * Copy legacy XSAVE area, to avoid complications with CPUID * leaves 0 and 1 in the loop below. */ memcpy(xsave, src, XSAVE_HDR_OFFSET); /* Set XSTATE_BV and possibly XCOMP_BV. */ xsave->header.xfeatures = xstate_bv; if (boot_cpu_has(X86_FEATURE_XSAVES)) xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Copy each region from the non-compacted offset to the * possibly compacted offset. */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { u32 size, offset, ecx, edx; u64 xfeature_mask = valid & -valid; int xfeature_nr = fls64(xfeature_mask) - 1; cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); if (xfeature_nr == XFEATURE_PKRU) { memcpy(&vcpu->arch.pkru, src + offset, sizeof(vcpu->arch.pkru)); } else { void *dest = get_xsave_addr(xsave, xfeature_nr); if (dest) memcpy(dest, src + offset, size); } valid -= xfeature_mask; } } static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { if (!vcpu->arch.guest_fpu) return; if (boot_cpu_has(X86_FEATURE_XSAVE)) { memset(guest_xsave, 0, sizeof(struct kvm_xsave)); fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, &vcpu->arch.guest_fpu->state.fxsave, sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XFEATURE_MASK_FPSSE; } } #define XSAVE_MXCSR_OFFSET 24 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { u64 xstate_bv; u32 mxcsr; if (!vcpu->arch.guest_fpu) return 0; xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; if (boot_cpu_has(X86_FEATURE_XSAVE)) { /* * Here we allow setting states that are not present in * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { if (xstate_bv & ~XFEATURE_MASK_FPSSE || mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&vcpu->arch.guest_fpu->state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); } return 0; } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs) { if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; return; } guest_xcrs->nr_xcrs = 1; guest_xcrs->flags = 0; guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; } static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, struct kvm_xcrs *guest_xcrs) { int i, r = 0; if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) return -EINVAL; for (i = 0; i < guest_xcrs->nr_xcrs; i++) /* Only support XCR0 currently */ if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, guest_xcrs->xcrs[i].value); break; } if (r) r = -EINVAL; return r; } /* * kvm_set_guest_paused() indicates to the guest kernel that it has been * stopped by the hypervisor. This function will be called from the host only. * EINVAL is returned when the host attempts to set the flag for a guest that * does not support pv clocks. */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { if (!vcpu->arch.pv_time_enabled) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return 0; } static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { int r; uint16_t vmcs_version; void __user *user_ptr; if (cap->flags) return -EINVAL; switch (cap->cap) { case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; fallthrough; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: if (!kvm_x86_ops.nested_ops->enable_evmcs) return -ENOTTY; r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, sizeof(vmcs_version))) r = -EFAULT; } return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: if (!kvm_x86_ops.enable_direct_tlbflush) return -ENOTTY; return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; if (vcpu->arch.pv_cpuid.enforce) kvm_update_pv_runtime(vcpu); return 0; default: return -EINVAL; } } long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; union { struct kvm_sregs2 *sregs2; struct kvm_lapic_state *lapic; struct kvm_xsave *xsave; struct kvm_xcrs *xcrs; void *buffer; } u; vcpu_load(vcpu); u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.lapic) goto out; r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; } case KVM_SET_LAPIC: { r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); if (IS_ERR(u.lapic)) { r = PTR_ERR(u.lapic); goto out_nofree; } r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); break; } case KVM_INTERRUPT: { struct kvm_interrupt irq; r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); break; } case KVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); break; } case KVM_SMI: { r = kvm_vcpu_ioctl_smi(vcpu); break; } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); break; } case KVM_SET_CPUID2: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); break; } case KVM_GET_CPUID2: { struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, cpuid_arg->entries); if (r) goto out; r = -EFAULT; if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) goto out; r = 0; break; } case KVM_GET_MSRS: { int idx = srcu_read_lock(&vcpu->kvm->srcu); r = msr_io(vcpu, argp, do_get_msr, 1); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_SET_MSRS: { int idx = srcu_read_lock(&vcpu->kvm->srcu); r = msr_io(vcpu, argp, do_set_msr, 0); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; r = -EFAULT; if (copy_from_user(&tac, argp, sizeof(tac))) goto out; r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &tac, sizeof(tac))) goto out; r = 0; break; }; case KVM_SET_VAPIC_ADDR: { struct kvm_vapic_addr va; int idx; r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof(va))) goto out; idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_X86_SETUP_MCE: { u64 mcg_cap; r = -EFAULT; if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap))) goto out; r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); break; } case KVM_X86_SET_MCE: { struct kvm_x86_mce mce; r = -EFAULT; if (copy_from_user(&mce, argp, sizeof(mce))) goto out; r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); break; } case KVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); r = -EFAULT; if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) break; r = 0; break; } case KVM_SET_VCPU_EVENTS: { struct kvm_vcpu_events events; r = -EFAULT; if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); break; } case KVM_GET_DEBUGREGS: { struct kvm_debugregs dbgregs; kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); r = -EFAULT; if (copy_to_user(argp, &dbgregs, sizeof(struct kvm_debugregs))) break; r = 0; break; } case KVM_SET_DEBUGREGS: { struct kvm_debugregs dbgregs; r = -EFAULT; if (copy_from_user(&dbgregs, argp, sizeof(struct kvm_debugregs))) break; r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } case KVM_GET_XSAVE: { u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) break; kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); r = -EFAULT; if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) break; r = 0; break; } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; } r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xcrs) break; kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); r = -EFAULT; if (copy_to_user(argp, u.xcrs, sizeof(struct kvm_xcrs))) break; r = 0; break; } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); if (IS_ERR(u.xcrs)) { r = PTR_ERR(u.xcrs); goto out_nofree; } r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } case KVM_SET_TSC_KHZ: { u32 user_tsc_khz; r = -EINVAL; user_tsc_khz = (u32)arg; if (kvm_has_tsc_control && user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) user_tsc_khz = tsc_khz; if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) r = 0; goto out; } case KVM_GET_TSC_KHZ: { r = vcpu->arch.virtual_tsc_khz; goto out; } case KVM_KVMCLOCK_CTRL: { r = kvm_set_guest_paused(vcpu); goto out; } case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) goto out; r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } case KVM_GET_NESTED_STATE: { struct kvm_nested_state __user *user_kvm_nested_state = argp; u32 user_data_size; r = -EINVAL; if (!kvm_x86_ops.nested_ops->get_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); r = -EFAULT; if (get_user(user_data_size, &user_kvm_nested_state->size)) break; r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) break; if (r > user_data_size) { if (put_user(r, &user_kvm_nested_state->size)) r = -EFAULT; else r = -E2BIG; break; } r = 0; break; } case KVM_SET_NESTED_STATE: { struct kvm_nested_state __user *user_kvm_nested_state = argp; struct kvm_nested_state kvm_state; int idx; r = -EINVAL; if (!kvm_x86_ops.nested_ops->set_state) break; r = -EFAULT; if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) break; r = -EINVAL; if (kvm_state.size < sizeof(kvm_state)) break; if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING | KVM_STATE_NESTED_GIF_SET)) break; /* nested_run_pending implies guest_mode. */ if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING) && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) break; idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); break; #ifdef CONFIG_KVM_XEN case KVM_XEN_VCPU_GET_ATTR: { struct kvm_xen_vcpu_attr xva; r = -EFAULT; if (copy_from_user(&xva, argp, sizeof(xva))) goto out; r = kvm_xen_vcpu_get_attr(vcpu, &xva); if (!r && copy_to_user(argp, &xva, sizeof(xva))) r = -EFAULT; break; } case KVM_XEN_VCPU_SET_ATTR: { struct kvm_xen_vcpu_attr xva; r = -EFAULT; if (copy_from_user(&xva, argp, sizeof(xva))) goto out; r = kvm_xen_vcpu_set_attr(vcpu, &xva); break; } #endif case KVM_GET_SREGS2: { u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); r = -ENOMEM; if (!u.sregs2) goto out; __get_sregs2(vcpu, u.sregs2); r = -EFAULT; if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2))) goto out; r = 0; break; } case KVM_SET_SREGS2: { u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); if (IS_ERR(u.sregs2)) { r = PTR_ERR(u.sregs2); u.sregs2 = NULL; goto out; } r = __set_sregs2(vcpu, u.sregs2); break; } default: r = -EINVAL; } out: kfree(u.buffer); out_nofree: vcpu_put(vcpu); return r; } vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) { int ret; if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages) { if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; mutex_lock(&kvm->slots_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; mutex_unlock(&kvm->slots_lock); return 0; } static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { return kvm->arch.n_max_mmu_pages; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: memcpy(&chip->chip.pic, &pic->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: memcpy(&chip->chip.pic, &pic->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; break; } return r; } static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: spin_lock(&pic->lock); memcpy(&pic->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); spin_unlock(&pic->lock); break; case KVM_IRQCHIP_PIC_SLAVE: spin_lock(&pic->lock); memcpy(&pic->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); spin_unlock(&pic->lock); break; case KVM_IRQCHIP_IOAPIC: kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; break; } kvm_pic_update_irq(pic); return r; } static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); mutex_lock(&kps->lock); memcpy(ps, &kps->channels, sizeof(*ps)); mutex_unlock(&kps->lock); return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int i; struct kvm_pit *pit = kvm->arch.vpit; mutex_lock(&pit->pit_state.lock); memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); for (i = 0; i < 3; i++) kvm_pit_load_count(pit, i, ps->channels[i].count, 0); mutex_unlock(&pit->pit_state.lock); return 0; } static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); memset(&ps->reserved, 0, sizeof(ps->reserved)); return 0; } static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { int start = 0; int i; u32 prev_legacy, cur_legacy; struct kvm_pit *pit = kvm->arch.vpit; mutex_lock(&pit->pit_state.lock); prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; if (!prev_legacy && cur_legacy) start = 1; memcpy(&pit->pit_state.channels, &ps->channels, sizeof(pit->pit_state.channels)); pit->pit_state.flags = ps->flags; for (i = 0; i < 3; i++) kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, start && i == 0); mutex_unlock(&pit->pit_state.lock); return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) { struct kvm_pit *pit = kvm->arch.vpit; /* pit->pit_state.lock was overloaded to prevent userspace from getting * an inconsistent state after running multiple KVM_REINJECT_CONTROL * ioctls in parallel. Use a separate lock if that ioctl isn't rare. */ mutex_lock(&pit->pit_state.lock); kvm_pit_set_reinject(pit, control->pit_reinject); mutex_unlock(&pit->pit_state.lock); return 0; } void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { /* * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called * before reporting dirty_bitmap to userspace. KVM flushes the buffers * on all VM-Exits, thus we only need to kick running vCPUs to force a * VM-Exit. */ struct kvm_vcpu *vcpu; int i; kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, bool line_status) { if (!irqchip_in_kernel(kvm)) return -ENXIO; irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event->irq, irq_event->level, line_status); return 0; } int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; if (cap->flags) return -EINVAL; switch (cap->cap) { case KVM_CAP_DISABLE_QUIRKS: kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { mutex_lock(&kvm->lock); r = -EINVAL; if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) goto split_irqchip_unlock; r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; r = kvm_setup_empty_irq_routing(kvm); if (r) goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); break; } case KVM_CAP_X2APIC_API: r = -EINVAL; if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) break; if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) kvm->arch.x2apic_format = true; if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) kvm->arch.x2apic_broadcast_quirk_disabled = true; r = 0; break; case KVM_CAP_X86_DISABLE_EXITS: r = -EINVAL; if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) break; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." if (!mitigate_smt_rsb) { if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) pr_warn_once(SMT_RSB_MSG); if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && kvm_can_mwait_in_guest()) kvm->arch.mwait_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) kvm->arch.cstate_in_guest = true; } r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; r = 0; break; case KVM_CAP_EXCEPTION_PAYLOAD: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; case KVM_CAP_X86_USER_SPACE_MSR: r = -EINVAL; if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | KVM_MSR_EXIT_REASON_UNKNOWN | KVM_MSR_EXIT_REASON_FILTER)) break; kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; case KVM_CAP_X86_BUS_LOCK_EXIT: r = -EINVAL; if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE) break; if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) && (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; if (kvm_has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; break; #ifdef CONFIG_X86_SGX_KVM case KVM_CAP_SGX_ATTRIBUTE: { unsigned long allowed_attributes = 0; r = sgx_set_attribute(&allowed_attributes, cap->args[0]); if (r) break; /* KVM only supports the PROVISIONKEY privileged attribute. */ if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) kvm->arch.sgx_provisioning_allowed = true; else r = -EINVAL; break; } #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: r = -EINVAL; if (kvm_x86_ops.vm_copy_enc_context_from) r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); return r; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; break; } kvm->arch.hypercall_exit_enabled = cap->args[0]; r = 0; break; case KVM_CAP_EXIT_ON_EMULATION_FAILURE: r = -EINVAL; if (cap->args[0] & ~1) break; kvm->arch.exit_on_emulation_error = cap->args[0]; r = 0; break; default: r = -EINVAL; break; } return r; } static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) { struct kvm_x86_msr_filter *msr_filter; msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); if (!msr_filter) return NULL; msr_filter->default_allow = default_allow; return msr_filter; } static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) { u32 i; if (!msr_filter) return; for (i = 0; i < msr_filter->count; i++) kfree(msr_filter->ranges[i].bitmap); kfree(msr_filter); } static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, struct kvm_msr_filter_range *user_range) { unsigned long *bitmap = NULL; size_t bitmap_size; if (!user_range->nmsrs) return 0; if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) return -EINVAL; if (!user_range->flags) return -EINVAL; bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) return -EINVAL; bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); if (IS_ERR(bitmap)) return PTR_ERR(bitmap); msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) { .flags = user_range->flags, .base = user_range->base, .nmsrs = user_range->nmsrs, .bitmap = bitmap, }; msr_filter->count++; return 0; } static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, struct kvm_msr_filter *filter) { struct kvm_x86_msr_filter *new_filter, *old_filter; bool default_allow; bool empty = true; int r = 0; u32 i; if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) return -EINVAL; for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) empty &= !filter->ranges[i].nmsrs; default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); if (empty && !default_allow) return -EINVAL; new_filter = kvm_alloc_msr_filter(default_allow); if (!new_filter) return -ENOMEM; for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); if (r) { kvm_free_msr_filter(new_filter); return r; } } mutex_lock(&kvm->lock); /* The per-VM filter is protected by kvm->lock... */ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); rcu_assign_pointer(kvm->arch.msr_filter, new_filter); synchronize_srcu(&kvm->srcu); kvm_free_msr_filter(old_filter); kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); mutex_unlock(&kvm->lock); return 0; } #ifdef CONFIG_KVM_COMPAT /* for KVM_X86_SET_MSR_FILTER */ struct kvm_msr_filter_range_compat { __u32 flags; __u32 nmsrs; __u32 base; __u32 bitmap; }; struct kvm_msr_filter_compat { __u32 flags; struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; }; #define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { void __user *argp = (void __user *)arg; struct kvm *kvm = filp->private_data; long r = -ENOTTY; switch (ioctl) { case KVM_X86_SET_MSR_FILTER_COMPAT: { struct kvm_msr_filter __user *user_msr_filter = argp; struct kvm_msr_filter_compat filter_compat; struct kvm_msr_filter filter; int i; if (copy_from_user(&filter_compat, user_msr_filter, sizeof(filter_compat))) return -EFAULT; filter.flags = filter_compat.flags; for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { struct kvm_msr_filter_range_compat *cr; cr = &filter_compat.ranges[i]; filter.ranges[i] = (struct kvm_msr_filter_range) { .flags = cr->flags, .nmsrs = cr->nmsrs, .base = cr->base, .bitmap = (__u8 *)(ulong)cr->bitmap, }; } r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; } } return r; } #endif #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; int i, ret = 0; mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { if (!vcpu->arch.pv_time_enabled) continue; ret = kvm_set_guest_paused(vcpu); if (ret) { kvm_err("Failed to pause guest VCPU%d: %d\n", vcpu->vcpu_id, ret); break; } } mutex_unlock(&kvm->lock); return ret ? NOTIFY_BAD : NOTIFY_DONE; } int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) { switch (state) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: return kvm_arch_suspend_notifier(kvm); } return NOTIFY_DONE; } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -ENOTTY; /* * This union makes it completely explicit to gcc-3.x * that these two variables' stack usage should be * combined, not added together. */ union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; struct kvm_pit_config pit_config; } u; switch (ioctl) { case KVM_SET_TSS_ADDR: r = kvm_vm_ioctl_set_tss_addr(kvm, arg); break; case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; mutex_lock(&kvm->lock); r = -EINVAL; if (kvm->created_vcpus) goto set_identity_unlock; r = -EFAULT; if (copy_from_user(&ident_addr, argp, sizeof(ident_addr))) goto set_identity_unlock; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); set_identity_unlock: mutex_unlock(&kvm->lock); break; } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); r = -EEXIST; if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; r = kvm_pic_init(kvm); if (r) goto create_irqchip_unlock; r = kvm_ioapic_init(kvm); if (r) { kvm_pic_destroy(kvm); goto create_irqchip_unlock; } r = kvm_setup_default_irq_routing(kvm); if (r) { kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); goto create_irqchip_unlock; } /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; } case KVM_CREATE_PIT: u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; goto create_pit; case KVM_CREATE_PIT2: r = -EFAULT; if (copy_from_user(&u.pit_config, argp, sizeof(struct kvm_pit_config))) goto out; create_pit: mutex_lock(&kvm->lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; r = -ENOMEM; kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); if (kvm->arch.vpit) r = 0; create_pit_unlock: mutex_unlock(&kvm->lock); break; case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; chip = memdup_user(argp, sizeof(*chip)); if (IS_ERR(chip)) { r = PTR_ERR(chip); goto out; } r = -ENXIO; if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) goto get_irqchip_out; r = -EFAULT; if (copy_to_user(argp, chip, sizeof(*chip))) goto get_irqchip_out; r = 0; get_irqchip_out: kfree(chip); break; } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; chip = memdup_user(argp, sizeof(*chip)); if (IS_ERR(chip)) { r = PTR_ERR(chip); goto out; } r = -ENXIO; if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); set_irqchip_out: kfree(chip); break; } case KVM_GET_PIT: { r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_get_pit(kvm, &u.ps); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) goto out; r = 0; break; } case KVM_SET_PIT: { r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(u.ps))) goto out; mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) goto set_pit_out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); set_pit_out: mutex_unlock(&kvm->lock); break; } case KVM_GET_PIT2: { r = -ENXIO; if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); if (r) goto out; r = -EFAULT; if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) goto out; r = 0; break; } case KVM_SET_PIT2: { r = -EFAULT; if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) goto out; mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) goto set_pit2_out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); set_pit2_out: mutex_unlock(&kvm->lock); break; } case KVM_REINJECT_CONTROL: { struct kvm_reinject_control control; r = -EFAULT; if (copy_from_user(&control, argp, sizeof(control))) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_reinject(kvm, &control); break; } case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; else kvm->arch.bsp_vcpu_id = arg; mutex_unlock(&kvm->lock); break; #ifdef CONFIG_KVM_XEN case KVM_XEN_HVM_CONFIG: { struct kvm_xen_hvm_config xhc; r = -EFAULT; if (copy_from_user(&xhc, argp, sizeof(xhc))) goto out; r = kvm_xen_hvm_config(kvm, &xhc); break; } case KVM_XEN_HVM_GET_ATTR: { struct kvm_xen_hvm_attr xha; r = -EFAULT; if (copy_from_user(&xha, argp, sizeof(xha))) goto out; r = kvm_xen_hvm_get_attr(kvm, &xha); if (!r && copy_to_user(argp, &xha, sizeof(xha))) r = -EFAULT; break; } case KVM_XEN_HVM_SET_ATTR: { struct kvm_xen_hvm_attr xha; r = -EFAULT; if (copy_from_user(&xha, argp, sizeof(xha))) goto out; r = kvm_xen_hvm_set_attr(kvm, &xha); break; } #endif case KVM_SET_CLOCK: { struct kvm_arch *ka = &kvm->arch; struct kvm_clock_data user_ns; u64 now_ns; r = -EFAULT; if (copy_from_user(&user_ns, argp, sizeof(user_ns))) goto out; r = -EINVAL; if (user_ns.flags) goto out; r = 0; /* * TODO: userspace has to take care of races with VCPU_RUN, so * kvm_gen_update_masterclock() can be cut down to locked * pvclock_update_vm_gtod_copy(). */ kvm_gen_update_masterclock(kvm); /* * This pairs with kvm_guest_time_update(): when masterclock is * in use, we use master_kernel_ns + kvmclock_offset to set * unsigned 'system_time' so if we use get_kvmclock_ns() (which * is slightly ahead) here we risk going negative on unsigned * 'system_time' when 'user_ns.clock' is very small. */ raw_spin_lock_irq(&ka->pvclock_gtod_sync_lock); if (kvm->arch.use_master_clock) now_ns = ka->master_kernel_ns; else now_ns = get_kvmclock_base_ns(); ka->kvmclock_offset = user_ns.clock - now_ns; raw_spin_unlock_irq(&ka->pvclock_gtod_sync_lock); kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } case KVM_GET_CLOCK: { struct kvm_clock_data user_ns; u64 now_ns; now_ns = get_kvmclock_ns(kvm); user_ns.clock = now_ns; user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; if (copy_to_user(argp, &user_ns, sizeof(user_ns))) goto out; r = 0; break; } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops.mem_enc_op) r = static_call(kvm_x86_mem_enc_op)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { struct kvm_enc_region region; r = -EFAULT; if (copy_from_user(&region, argp, sizeof(region))) goto out; r = -ENOTTY; if (kvm_x86_ops.mem_enc_reg_region) r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { struct kvm_enc_region region; r = -EFAULT; if (copy_from_user(&region, argp, sizeof(region))) goto out; r = -ENOTTY; if (kvm_x86_ops.mem_enc_unreg_region) r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region); break; } case KVM_HYPERV_EVENTFD: { struct kvm_hyperv_eventfd hvevfd; r = -EFAULT; if (copy_from_user(&hvevfd, argp, sizeof(hvevfd))) goto out; r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; case KVM_X86_SET_MSR_FILTER: { struct kvm_msr_filter __user *user_msr_filter = argp; struct kvm_msr_filter filter; if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) return -EFAULT; r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; } default: r = -ENOTTY; } out: return r; } static void kvm_init_msr_list(void) { struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, "Please update the fixed PMCs in msrs_to_saved_all[]"); perf_get_x86_pmu_capability(&x86_pmu); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed * to the guests in some cases. */ switch (msrs_to_save_all[i]) { case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) continue; break; case MSR_TSC_AUX: if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && !kvm_cpu_cap_has(X86_FEATURE_RDPID)) continue; break; case MSR_IA32_UMWAIT_CONTROL: if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) continue; break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) continue; break; case MSR_IA32_RTIT_CR3_MATCH: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) continue; break; case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; default: break; } msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { struct kvm_msr_entry msr; msr.index = msr_based_features_all[i]; if (kvm_get_msr_feature(&msr)) continue; msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; } } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) { int handled = 0; int n; do { n = min(len, 8); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) break; handled += n; addr += n; len -= n; v += n; } while (len); return handled; } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { int handled = 0; int n; do { n = min(len, 8); if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); handled += n; addr += n; len -= n; v += n; } while (len); return handled; } static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { static_call(kvm_x86_set_segment)(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { static_call(kvm_x86_get_segment)(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { gpa_t t_gpa; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); return t_gpa; } gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u32 access, struct x86_exception *exception) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; } bytes -= toread; data += toread; addr += toread; } out: return r; } /* used for instruction fetching */ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; /* Inline kvm_read_guest_virt_helper for speed. */ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, exception); if (unlikely(gpa == UNMAPPED_GVA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); if (WARN_ON(offset + bytes > PAGE_SIZE)) bytes = (unsigned)PAGE_SIZE - offset; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val, offset, bytes); if (unlikely(ret < 0)) return X86EMUL_IO_NEEDED; return X86EMUL_CONTINUE; } int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED * is returned, but our callers are not ready for that and they blindly * call kvm_inject_page_fault. Ensure that they at least do not leak * uninitialized kernel stack memory into cr2 and error code. */ memset(exception, 0, sizeof(*exception));