Total coverage: 445337 (23%)of 1987316
26713 26718 17155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/init.h> #include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/clocksource.h> #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> #include <linux/static_call.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/i8259.h> #include <asm/topology.h> #include <asm/uv/uv.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); #define KHZ 1000 /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; static struct clocksource_base art_base_clk = { .id = CSID_X86_ART, }; static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); static int __init tsc_early_khz_setup(char *buf) { return kstrtouint(buf, 0, &tsc_early_khz); } early_param("tsc_early_khz", tsc_early_khz_setup); __always_inline void __cyc2ns_read(struct cyc2ns_data *data) { int seq, idx; do { seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { preempt_disable_notrace(); __cyc2ns_read(data); } __always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } /* * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) * ns = cycles * (ns_per_sec / freq) * ns = cycles * (10^9 / (cpu_khz * 10^3)) * ns = cycles * (10^6 / cpu_khz) * * Then we use scaling math (suggested by george@mvista.com) to get: * ns = cycles * (10^6 * SC / cpu_khz) / SC * ns = cycles * cyc2ns_scale / SC * * And since SC is a constant power of two, we can convert the div * into a shift. The larger SC is, the more accurate the conversion, but * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication * (64-bit result) can be used. * * We can use khz divisor instead of mhz to keep a better precision. * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns; __cyc2ns_read(&data); ns = data.cyc2ns_offset; ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); return ns; } static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { unsigned long long ns; preempt_disable_notrace(); ns = __cycles_2_ns(cyc); preempt_enable_notrace(); return ns; } static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; ns_now = cycles_2_ns(tsc_now); /* * Compute a new multiplier as per the above comment and ensure our * time function is continuous; see the comment near struct * cyc2ns_data. */ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* * cyc2ns_shift is exported via arch_perf_update_userpage() where it is * not expected to be greater than 31 due to the original published * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ if (data.cyc2ns_shift == 32) { data.cyc2ns_shift = 31; data.cyc2ns_mul >>= 1; } data.cyc2ns_offset = ns_now - mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); c2n = per_cpu_ptr(&cyc2ns, cpu); write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; write_seqcount_latch(&c2n->seq); c2n->data[1] = data; write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); if (khz) __set_cyc2ns_scale(khz, cpu, tsc_now); sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* * Initialize cyc2ns for boot cpu */ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. */ static void __init cyc2ns_init_secondary_cpus(void) { unsigned int cpu, this_cpu = smp_processor_id(); struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); struct cyc2ns_data *data = c2n->data; for_each_possible_cpu(cpu) { if (cpu != this_cpu) { seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; } } } /* * Scheduler clock - returns current time in nanosec units. */ noinstr u64 native_sched_clock(void) { if (static_branch_likely(&__use_tsc)) { u64 tsc_now = rdtsc(); /* return the value in ns */ return __cycles_2_ns(tsc_now); } /* * Fall back to jiffies if there's no TSC available: * ( But note that we still use it if the TSC is marked * unstable. We do this because unlike Time Of Day, * the scheduler clock tolerates small errors and it's * very important for it to be as fast as the platform * can achieve it. ) */ /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); } /* * Generate a sched_clock if you already have a TSC value. */ u64 native_sched_clock_from_tsc(u64 tsc) { return cycles_2_ns(tsc); } /* We need to define a real function for sched_clock, to override the weak default version */ #ifdef CONFIG_PARAVIRT noinstr u64 sched_clock_noinstr(void) { return paravirt_sched_clock(); } bool using_native_sched_clock(void) { return static_call_query(pv_sched_clock) == native_sched_clock; } #else u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock"))); bool using_native_sched_clock(void) { return true; } #endif notrace u64 sched_clock(void) { u64 now; preempt_disable_notrace(); now = sched_clock_noinstr(); preempt_enable_notrace(); return now; } int check_tsc_unstable(void) { return tsc_unstable; } EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { mark_tsc_unstable("boot parameter notsc"); return 1; } #else /* * disable flag for tsc. Takes effect by clearing the TSC cpu flag * in cpu/common.c */ int __init notsc_setup(char *str) { setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif __setup("notsc", notsc_setup); static int no_sched_irq_time; static int no_tsc_watchdog; static int tsc_as_watchdog; static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); if (!strcmp(str, "nowatchdog")) { no_tsc_watchdog = 1; if (tsc_as_watchdog) pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", __func__); tsc_as_watchdog = 0; } if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; if (!strcmp(str, "watchdog")) { if (no_tsc_watchdog) pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", __func__); else tsc_as_watchdog = 1; } return 1; } __setup("tsc=", tsc_setup); #define MAX_RETRIES 5 #define TSC_DEFAULT_THRESHOLD 0x20000 /* * Read TSC and the reference counters. Take care of any disturbances */ static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD; int i; for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < thresh) return t2; } return ULLONG_MAX; } /* * Calculate the TSC frequency from HPET reference */ static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) { u64 tmp; if (hpet2 < hpet1) hpet2 += 0x100000000ULL; hpet2 -= hpet1; tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); do_div(tmp, 1000000); deltatsc = div64_u64(deltatsc, tmp); return (unsigned long) deltatsc; } /* * Calculate the TSC frequency from PMTimer reference */ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) { u64 tmp; if (!pm1 && !pm2) return ULONG_MAX; if (pm2 < pm1) pm2 += (u64)ACPI_PM_OVRRUN; pm2 -= pm1; tmp = pm2 * 1000000000LL; do_div(tmp, PMTMR_TICKS_PER_SEC); do_div(deltatsc, tmp); return (unsigned long) deltatsc; } #define CAL_MS 10 #define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 #define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 /* * Try to calibrate the TSC against the Programmable * Interrupt Timer and return the frequency of the TSC * in kHz. * * Return ULONG_MAX on failure to calibrate. */ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) { u64 tsc, t1, t2, delta; unsigned long tscmin, tscmax; int pitcnt; if (!has_legacy_pic()) { /* * Relies on tsc_early_delay_calibrate() to have given us semi * usable udelay(), wait for the same 50ms we would have with * the PIT loop below. */ udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); udelay(10 * USEC_PER_MSEC); return ULONG_MAX; } /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Setup CTC channel 2* for mode 0, (interrupt on terminal * count mode), binary count. Set the latch register to 50ms * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); outb(latch & 0xff, 0x42); outb(latch >> 8, 0x42); tsc = t1 = t2 = get_cycles(); pitcnt = 0; tscmax = 0; tscmin = ULONG_MAX; while ((inb(0x61) & 0x20) == 0) { t2 = get_cycles(); delta = t2 - tsc; tsc = t2; if ((unsigned long) delta < tscmin) tscmin = (unsigned int) delta; if ((unsigned long) delta > tscmax) tscmax = (unsigned int) delta; pitcnt++; } /* * Sanity checks: * * If we were not able to read the PIT more than loopmin * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ if (pitcnt < loopmin || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; do_div(delta, ms); return delta; } /* * This reads the current MSB of the PIT counter, and * checks if we are running on sufficiently fast and * non-virtualized hardware. * * Our expectations are: * * - the PIT is running at roughly 1.19MHz * * - each IO is going to take about 1us on real hardware, * but we allow it to be much faster (by a factor of 10) or * _slightly_ slower (ie we allow up to a 2us read+counter * update - anything else implies a unacceptably slow CPU * or PIT for the fast calibration to work. * * - with 256 PIT ticks to read the value, we have 214us to * see the same MSB (and overhead like doing a single TSC * read per MSB value etc). * * - We're doing 2 reads per loop (LSB, MSB), and we expect * them each to take about a microsecond on real hardware. * So we expect a count value of around 100. But we'll be * generous, and accept anything over 50. * * - if the PIT is stuck, and we see *many* more reads, we * return early (and the next caller of pit_expect_msb() * then consider it a failure when they don't see the * next expected value). * * These expectations mean that we know that we have seen the * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { /* Ignore LSB */ inb(0x42); return inb(0x42) == val; } static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; prev_tsc = tsc; tsc = get_cycles(); } *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* * We require _some_ success, but the quality control * will be based on the error terms on the TSC values. */ return count > 5; } /* * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend * more than 50ms on it. */ #define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) { int i; u64 tsc, delta; unsigned long d1, d2; if (!has_legacy_pic()) return 0; /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); /* * Counter 2, mode 0 (one-shot), binary count * * NOTE! Mode 2 decrements by two (and then the * output is flipped each time, giving the same * final output frequency as a decrement-by-one), * so mode 0 is much better when looking at the * individual counts. */ outb(0xb0, 0x43); /* Start at 0xffff */ outb(0xff, 0x42); outb(0xff, 0x42); /* * The PIT starts counting at the next edge, so we * need to delay for a microsecond. The easiest way * to do that is to just read back the 16-bit counter * once from the PIT. */ pit_verify_msb(0); if (pit_expect_msb(0xff, &tsc, &d1)) { for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) { if (!pit_expect_msb(0xff-i, &delta, &d2)) break; delta -= tsc; /* * Extrapolate the error and fail fast if the error will * never be below 500 ppm. */ if (i == 1 && d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) return 0; /* * Iterate until the error is less than 500 ppm */ if (d1+d2 >= delta >> 11) continue; /* * Check the PIT one more time to verify that * all TSC reads were stable wrt the PIT. * * This also guarantees serialization of the * last cycle read ('d2') in pit_expect_msb. */ if (!pit_verify_msb(0xfe - i)) break; goto success; } } pr_info("Fast TSC calibration failed\n"); return 0; success: /* * Ok, if we get here, then we've seen the * MSB of the PIT decrement 'i' times, and the * error has shrunk to less than 500 ppm. * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); pr_info("Fast TSC calibration using PIT\n"); return delta; } /** * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) { unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; unsigned int crystal_khz; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < 0x15) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; crystal_khz = ecx_hz / 1000; /* * Denverton SoCs don't report crystal clock, and also don't support * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal * clock. */ if (crystal_khz == 0 && boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ if (crystal_khz != 0) setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); /* * Some Intel SoCs like Skylake and Kabylake don't report the crystal * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. */ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { unsigned int eax_base_mhz, ebx, ecx, edx; cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } if (crystal_khz == 0) return 0; /* * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC /* * The local APIC appears to be fed by the core crystal clock * (which sounds entirely sensible). We can set the global * lapic_timer_period here to avoid having to calibrate the APIC * timer later. */ lapic_timer_period = crystal_khz * 1000 / HZ; #endif return crystal_khz * ebx_numerator / eax_denominator; } static unsigned long cpu_khz_from_cpuid(void) { unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; if (boot_cpu_data.cpuid_level < 0x16) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } /* * calibrate cpu using pit, hpet, and ptimer methods. They are available * later in boot after acpi is initialized. */ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes * here: * * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and * load a timeout of 50ms. We read the time right after we * started the timer and wait until the PIT count down reaches * zero. In each wait loop iteration we read the TSC and check * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when * any disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. */ /* Preset PIT loop values */ latch = CAL_LATCH; ms = CAL_MS; loopmin = CAL_PIT_LOOPS; for (i = 0; i < 3; i++) { unsigned long tsc_pit_khz; /* * Read the start value and the reference count of * hpet/pmtimer when available. Then do the PIT * calibration, which will take at least 50ms, and * read the end value. */ local_irq_save(flags); tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed */ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) continue; tsc2 = (tsc2 - tsc1) * 1000000LL; if (hpet) tsc2 = calc_hpet_ref(tsc2, ref1, ref2); else tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); /* Check the reference deviation */ delta = ((u64) tsc_pit_min) * 100; do_div(delta, tsc_ref_min); /* * If both calibration results are inside a 10% window * then we can be sure, that the calibration * succeeded. We break out of the loop right away. We * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { pr_info("PIT calibration matches %s. %d loops\n", hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } /* * Check whether PIT failed more than once. This * happens in virtualized environments. We need to * give the virtual PC a slightly longer timeframe for * the HPET/PMTIMER to make the result precise. */ if (i == 1 && tsc_pit_min == ULONG_MAX) { latch = CAL2_LATCH; ms = CAL2_MS; loopmin = CAL2_PIT_LOOPS; } } /* * Now check the results. */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ pr_info("using %s reference calibration\n", hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } /* * The calibration values differ too much. In doubt, we use * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ pr_warn("PIT calibration deviates from %s: %lu %lu\n", hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /** * native_calibrate_cpu_early - can calibrate the cpu early in boot */ unsigned long native_calibrate_cpu_early(void) { unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); if (!fast_calibrate) fast_calibrate = cpu_khz_from_msr(); if (!fast_calibrate) { local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); } return fast_calibrate; } /** * native_calibrate_cpu - calibrate the cpu */ static unsigned long native_calibrate_cpu(void) { unsigned long tsc_freq = native_calibrate_cpu_early(); if (!tsc_freq) tsc_freq = pit_hpet_ptimer_calibrate_cpu(); return tsc_freq; } void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); #endif } EXPORT_SYMBOL_GPL(recalibrate_cpu_khz); static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { if (!sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); } /* * Even on processors with invariant TSC, TSC gets reset in some the * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to * arbitrary value (still sync'd across cpu's) during resume from such sleep * states. To cope up with this, recompute the cyc2ns_offset for each cpu so * that sched_clock() continues from the point where it was left off during * suspend. */ void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; int cpu; if (!sched_clock_stable()) return; local_irq_save(flags); /* * We're coming out of suspend, there's no concurrency yet; don't * bother being nice about the RCU stuff, just write to both * data fields. */ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) { per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; } local_irq_restore(flags); } #ifdef CONFIG_CPU_FREQ /* * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. */ static unsigned int ref_freq; static unsigned long loops_per_jiffy_ref; static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; if (num_online_cpus() > 1) { mark_tsc_unstable("cpufreq changes on SMP"); return 0; } if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { boot_cpu_data.loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); set_cyc2ns_scale(tsc_khz, freq->policy->cpu, rdtsc()); } return 0; } static struct notifier_block time_cpufreq_notifier_block = { .notifier_call = time_cpufreq_notifier }; static int __init cpufreq_register_tsc_scaling(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return 0; if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; } core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ #define ART_CPUID_LEAF (0x15) #define ART_MIN_DENOMINATOR (1) /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { unsigned int unused; if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; /* * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, * and the TSC counter resets must not occur asynchronously. */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || tsc_async_resets) return; cpuid(ART_CPUID_LEAF, &art_base_clk.denominator, &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); art_base_clk.freq_khz /= KHZ; if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } /* clocksource code */ static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); } /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm * timer. * * This sanity check is now done in the core timekeeping code. * checking the result of read_tsc() - cycle_last for being negative. * That works because CLOCKSOURCE_MASK(64) does not mask out any bit. */ static u64 read_tsc(struct clocksource *cs) { return (u64)rdtsc_ordered(); } static void tsc_cs_mark_unstable(struct clocksource *cs) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to clocksource watchdog\n"); } static void tsc_cs_tick_stable(struct clocksource *cs) { if (tsc_unstable) return; if (using_native_sched_clock()) sched_clock_tick_stable(); } static int tsc_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_TSC); return 0; } /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc_early.list), }; /* * Must mark VALID_FOR_HRES early such that when we unregister tsc_early * this one will immediately take over. We will only register if TSC has * been found good. */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, .list = LIST_HEAD_INIT(clocksource_tsc.list), }; void mark_tsc_unstable(char *reason) { if (tsc_unstable) return; tsc_unstable = 1; if (using_native_sched_clock()) clear_sched_clock_stable(); disable_sched_clock_irqtime(); pr_info("Marking TSC unstable due to %s\n", reason); clocksource_mark_unstable(&clocksource_tsc_early); clocksource_mark_unstable(&clocksource_tsc); } EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init tsc_disable_clocksource_watchdog(void) { clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } bool tsc_clocksource_watchdog_disabled(void) { return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && tsc_as_watchdog && !no_tsc_watchdog; } static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) if (is_geode_lx()) { /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; /* * Disable the clocksource watchdog when the system has: * - TSC running at constant frequency * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications * - not more than four packages */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && topology_max_packages() <= 4) tsc_disable_clocksource_watchdog(); } /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. */ int unsynchronized_tsc(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable) return 1; #ifdef CONFIG_SMP if (apic_is_clustered_box()) return 1; #endif if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; if (tsc_clocksource_reliable) return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (topology_max_packages() > 1) return 1; } return 0; } static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. */ static void tsc_refine_calibration_work(struct work_struct *work) { static u64 tsc_start = ULLONG_MAX, ref_start; static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; int cpu; /* Don't bother refining TSC on unstable systems */ if (tsc_unstable) goto unreg; /* * Since the work is started early in boot, we may be * delayed the first time we expire. So set the workqueue * again once we know timers are working. */ if (tsc_start == ULLONG_MAX) { restart: /* * Only set hpet once, to avoid mixing hardware * if the hpet becomes enabled later. */ hpet = is_hpet_enabled(); tsc_start = tsc_read_refs(&ref_start, hpet); schedule_delayed_work(&tsc_irqwork, HZ); return; } tsc_stop = tsc_read_refs(&ref_stop, hpet); /* hpet or pmtimer available ? */ if (ref_start == ref_stop) goto out; /* Check, whether the sampling was disturbed */ if (tsc_stop == ULLONG_MAX) goto restart; delta = tsc_stop - tsc_start; delta *= 1000000LL; if (hpet) freq = calc_hpet_ref(delta, ref_start, ref_stop); else freq = calc_pmtimer_ref(delta, ref_start, ref_stop); /* Will hit this only if tsc_force_recalibrate has been set */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { /* Warn if the deviation exceeds 500 ppm */ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) { pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n"); pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); } pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n", hpet ? "HPET" : "PM_TIMER", (unsigned long)freq / 1000, (unsigned long)freq % 1000); return; } /* Make sure we're within 1% */ if (abs(tsc_khz - freq) > tsc_khz/100) goto out; tsc_khz = freq; pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); /* Update the sched_clock() rate to match the clocksource one */ for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: if (tsc_unstable) goto unreg; if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); } static int __init init_tsc_clocksource(void) { if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) { clocksource_unregister(&clocksource_tsc_early); return 0; } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* * When TSC frequency is known (retrieved via MSR or CPUID), we skip * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; clocksource_tsc.base = &art_base_clk; } clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); if (!tsc_force_recalibrate) return 0; } schedule_delayed_work(&tsc_irqwork, 0); return 0; } /* * We use device_initcall here, to ensure we run after the hpet * is fully initialized, which may occur at fs_initcall time. */ device_initcall(init_tsc_clocksource); static bool __init determine_cpu_tsc_frequencies(bool early) { /* Make sure that cpu and tsc are not already calibrated */ WARN_ON(cpu_khz || tsc_khz); if (early) { cpu_khz = x86_platform.calibrate_cpu(); if (tsc_early_khz) { tsc_khz = tsc_early_khz; } else { tsc_khz = x86_platform.calibrate_tsc(); clocksource_tsc.freq_khz = tsc_khz; } } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } /* * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ if (tsc_khz == 0) tsc_khz = cpu_khz; else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; if (tsc_khz == 0) return false; pr_info("Detected %lu.%03lu MHz processor\n", (unsigned long)cpu_khz / KHZ, (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", (unsigned long)tsc_khz / KHZ, (unsigned long)tsc_khz % KHZ); } return true; } static unsigned long __init get_loops_per_jiffy(void) { u64 lpj = (u64)tsc_khz * KHZ; do_div(lpj, HZ); return lpj; } static void __init tsc_enable_sched_clock(void) { loops_per_jiffy = get_loops_per_jiffy(); use_tsc_delay(); /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); static_branch_enable(&__use_tsc); } void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); } void __init tsc_init(void) { if (!cpu_feature_enabled(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } /* * native_calibrate_cpu_early can only calibrate using methods that are * available early in boot. */ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) x86_platform.calibrate_cpu = native_calibrate_cpu; if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ if (!determine_cpu_tsc_frequencies(false)) { mark_tsc_unstable("could not calculate TSC khz"); setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } tsc_enable_sched_clock(); } cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); check_system_tsc_reliable(); if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); return; } if (tsc_clocksource_reliable || no_tsc_watchdog) tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } #ifdef CONFIG_SMP /* * Check whether existing calibration data can be reused. */ unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); /* * If TSC has constant frequency and TSC is synchronized across * sockets then reuse CPU0 calibration. */ if (constant_tsc && !tsc_unstable) return cpu_data(0).loops_per_jiffy; /* * If TSC has constant frequency and TSC is not synchronized across * sockets and this is not the first CPU in the socket, then reuse * the calibration value of an already online CPU on that socket. * * This assumes that CONSTANT_TSC is consistent for all CPUs in a * socket. */ if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); if (sibling < nr_cpu_ids) return cpu_data(sibling).loops_per_jiffy; return 0; } #endif
8 8 8 8 8 8 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 1 3 8 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 3 10 10 4 1 8 10 10 10 9 10 10 10 8 8 8 10 2 10 10 2 9 9 4 5 9 8 8 8 10 8 10 9 5 8 8 8 8 8 2 2 6 8 8 8 8 8 8 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 // SPDX-License-Identifier: GPL-2.0+ /* * usblp.c * * Copyright (c) 1999 Michael Gee <michael@linuxspecific.com> * Copyright (c) 1999 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2000 Randy Dunlap <rdunlap@xenotime.net> * Copyright (c) 2000 Vojtech Pavlik <vojtech@suse.cz> # Copyright (c) 2001 Pete Zaitcev <zaitcev@redhat.com> # Copyright (c) 2001 David Paschal <paschal@rcsis.com> * Copyright (c) 2006 Oliver Neukum <oliver@neukum.name> * * USB Printer Device Class driver for USB printers and printer cables * * Sponsored by SuSE * * ChangeLog: * v0.1 - thorough cleaning, URBification, almost a rewrite * v0.2 - some more cleanups * v0.3 - cleaner again, waitqueue fixes * v0.4 - fixes in unidirectional mode * v0.5 - add DEVICE_ID string support * v0.6 - never time out * v0.7 - fixed bulk-IN read and poll (David Paschal) * v0.8 - add devfs support * v0.9 - fix unplug-while-open paths * v0.10- remove sleep_on, fix error on oom (oliver@neukum.org) * v0.11 - add proto_bias option (Pete Zaitcev) * v0.12 - add hpoj.sourceforge.net ioctls (David Paschal) * v0.13 - alloc space for statusbuf (<status> not on stack); * use usb_alloc_coherent() for read buf & write buf; * none - Maintained in Linux kernel after v0.13 */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/signal.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/lp.h> #include <linux/mutex.h> #undef DEBUG #include <linux/usb.h> #include <linux/usb/ch9.h> #include <linux/ratelimit.h> /* * Version Information */ #define DRIVER_AUTHOR "Michael Gee, Pavel Machek, Vojtech Pavlik, Randy Dunlap, Pete Zaitcev, David Paschal" #define DRIVER_DESC "USB Printer Device Class driver" #define USBLP_BUF_SIZE 8192 #define USBLP_BUF_SIZE_IN 1024 #define USBLP_DEVICE_ID_SIZE 1024 /* ioctls: */ #define IOCNR_GET_DEVICE_ID 1 #define IOCNR_GET_PROTOCOLS 2 #define IOCNR_SET_PROTOCOL 3 #define IOCNR_HP_SET_CHANNEL 4 #define IOCNR_GET_BUS_ADDRESS 5 #define IOCNR_GET_VID_PID 6 #define IOCNR_SOFT_RESET 7 /* Get device_id string: */ #define LPIOC_GET_DEVICE_ID(len) _IOC(_IOC_READ, 'P', IOCNR_GET_DEVICE_ID, len) /* The following ioctls were added for http://hpoj.sourceforge.net: * Get two-int array: * [0]=current protocol * (1=USB_CLASS_PRINTER/1/1, 2=USB_CLASS_PRINTER/1/2, * 3=USB_CLASS_PRINTER/1/3), * [1]=supported protocol mask (mask&(1<<n)!=0 means * USB_CLASS_PRINTER/1/n supported): */ #define LPIOC_GET_PROTOCOLS(len) _IOC(_IOC_READ, 'P', IOCNR_GET_PROTOCOLS, len) /* * Set protocol * (arg: 1=USB_CLASS_PRINTER/1/1, 2=USB_CLASS_PRINTER/1/2, * 3=USB_CLASS_PRINTER/1/3): */ #define LPIOC_SET_PROTOCOL _IOC(_IOC_WRITE, 'P', IOCNR_SET_PROTOCOL, 0) /* Set channel number (HP Vendor-specific command): */ #define LPIOC_HP_SET_CHANNEL _IOC(_IOC_WRITE, 'P', IOCNR_HP_SET_CHANNEL, 0) /* Get two-int array: [0]=bus number, [1]=device address: */ #define LPIOC_GET_BUS_ADDRESS(len) _IOC(_IOC_READ, 'P', IOCNR_GET_BUS_ADDRESS, len) /* Get two-int array: [0]=vendor ID, [1]=product ID: */ #define LPIOC_GET_VID_PID(len) _IOC(_IOC_READ, 'P', IOCNR_GET_VID_PID, len) /* Perform class specific soft reset */ #define LPIOC_SOFT_RESET _IOC(_IOC_NONE, 'P', IOCNR_SOFT_RESET, 0); /* * A DEVICE_ID string may include the printer's serial number. * It should end with a semi-colon (';'). * An example from an HP 970C DeskJet printer is (this is one long string, * with the serial number changed): MFG:HEWLETT-PACKARD;MDL:DESKJET 970C;CMD:MLC,PCL,PML;CLASS:PRINTER;DESCRIPTION:Hewlett-Packard DeskJet 970C;SERN:US970CSEPROF;VSTATUS:$HB0$NC0,ff,DN,IDLE,CUT,K1,C0,DP,NR,KP000,CP027;VP:0800,FL,B0;VJ: ; */ /* * USB Printer Requests */ #define USBLP_REQ_GET_ID 0x00 #define USBLP_REQ_GET_STATUS 0x01 #define USBLP_REQ_RESET 0x02 #define USBLP_REQ_HP_CHANNEL_CHANGE_REQUEST 0x00 /* HP Vendor-specific */ #define USBLP_MINORS 16 #define USBLP_MINOR_BASE 0 #define USBLP_CTL_TIMEOUT 5000 /* 5 seconds */ #define USBLP_FIRST_PROTOCOL 1 #define USBLP_LAST_PROTOCOL 3 #define USBLP_MAX_PROTOCOLS (USBLP_LAST_PROTOCOL+1) /* * some arbitrary status buffer size; * need a status buffer that is allocated via kmalloc(), not on stack */ #define STATUS_BUF_SIZE 8 /* * Locks down the locking order: * ->wmut locks wstatus. * ->mut locks the whole usblp, except [rw]complete, and thus, by indirection, * [rw]status. We only touch status when we know the side idle. * ->lock locks what interrupt accesses. */ struct usblp { struct usb_device *dev; /* USB device */ struct mutex wmut; struct mutex mut; spinlock_t lock; /* locks rcomplete, wcomplete */ char *readbuf; /* read transfer_buffer */ char *statusbuf; /* status transfer_buffer */ struct usb_anchor urbs; wait_queue_head_t rwait, wwait; int readcount; /* Counter for reads */ int ifnum; /* Interface number */ struct usb_interface *intf; /* The interface */ /* * Alternate-setting numbers and endpoints for each protocol * (USB_CLASS_PRINTER/1/{index=1,2,3}) that the device supports: */ struct { int alt_setting; struct usb_endpoint_descriptor *epwrite; struct usb_endpoint_descriptor *epread; } protocol[USBLP_MAX_PROTOCOLS]; int current_protocol; int minor; /* minor number of device */ int wcomplete, rcomplete; int wstatus; /* bytes written or error */ int rstatus; /* bytes ready or error */ unsigned int quirks; /* quirks flags */ unsigned int flags; /* mode flags */ unsigned char used; /* True if open */ unsigned char present; /* True if not disconnected */ unsigned char bidir; /* interface is bidirectional */ unsigned char no_paper; /* Paper Out happened */ unsigned char *device_id_string; /* IEEE 1284 DEVICE ID string (ptr) */ /* first 2 bytes are (big-endian) length */ }; #ifdef DEBUG static void usblp_dump(struct usblp *usblp) { struct device *dev = &usblp->intf->dev; int p; dev_dbg(dev, "usblp=0x%p\n", usblp); dev_dbg(dev, "dev=0x%p\n", usblp->dev); dev_dbg(dev, "present=%d\n", usblp->present); dev_dbg(dev, "readbuf=0x%p\n", usblp->readbuf); dev_dbg(dev, "readcount=%d\n", usblp->readcount); dev_dbg(dev, "ifnum=%d\n", usblp->ifnum); for (p = USBLP_FIRST_PROTOCOL; p <= USBLP_LAST_PROTOCOL; p++) { dev_dbg(dev, "protocol[%d].alt_setting=%d\n", p, usblp->protocol[p].alt_setting); dev_dbg(dev, "protocol[%d].epwrite=%p\n", p, usblp->protocol[p].epwrite); dev_dbg(dev, "protocol[%d].epread=%p\n", p, usblp->protocol[p].epread); } dev_dbg(dev, "current_protocol=%d\n", usblp->current_protocol); dev_dbg(dev, "minor=%d\n", usblp->minor); dev_dbg(dev, "wstatus=%d\n", usblp->wstatus); dev_dbg(dev, "rstatus=%d\n", usblp->rstatus); dev_dbg(dev, "quirks=%d\n", usblp->quirks); dev_dbg(dev, "used=%d\n", usblp->used); dev_dbg(dev, "bidir=%d\n", usblp->bidir); dev_dbg(dev, "device_id_string=\"%s\"\n", usblp->device_id_string ? usblp->device_id_string + 2 : (unsigned char *)"(null)"); } #endif /* Quirks: various printer quirks are handled by this table & its flags. */ struct quirk_printer_struct { __u16 vendorId; __u16 productId; unsigned int quirks; }; #define USBLP_QUIRK_BIDIR 0x1 /* reports bidir but requires unidirectional mode (no INs/reads) */ #define USBLP_QUIRK_USB_INIT 0x2 /* needs vendor USB init string */ #define USBLP_QUIRK_BAD_CLASS 0x4 /* descriptor uses vendor-specific Class or SubClass */ static const struct quirk_printer_struct quirk_printers[] = { { 0x03f0, 0x0004, USBLP_QUIRK_BIDIR }, /* HP DeskJet 895C */ { 0x03f0, 0x0104, USBLP_QUIRK_BIDIR }, /* HP DeskJet 880C */ { 0x03f0, 0x0204, USBLP_QUIRK_BIDIR }, /* HP DeskJet 815C */ { 0x03f0, 0x0304, USBLP_QUIRK_BIDIR }, /* HP DeskJet 810C/812C */ { 0x03f0, 0x0404, USBLP_QUIRK_BIDIR }, /* HP DeskJet 830C */ { 0x03f0, 0x0504, USBLP_QUIRK_BIDIR }, /* HP DeskJet 885C */ { 0x03f0, 0x0604, USBLP_QUIRK_BIDIR }, /* HP DeskJet 840C */ { 0x03f0, 0x0804, USBLP_QUIRK_BIDIR }, /* HP DeskJet 816C */ { 0x03f0, 0x1104, USBLP_QUIRK_BIDIR }, /* HP Deskjet 959C */ { 0x0409, 0xefbe, USBLP_QUIRK_BIDIR }, /* NEC Picty900 (HP OEM) */ { 0x0409, 0xbef4, USBLP_QUIRK_BIDIR }, /* NEC Picty760 (HP OEM) */ { 0x0409, 0xf0be, USBLP_QUIRK_BIDIR }, /* NEC Picty920 (HP OEM) */ { 0x0409, 0xf1be, USBLP_QUIRK_BIDIR }, /* NEC Picty800 (HP OEM) */ { 0x0482, 0x0010, USBLP_QUIRK_BIDIR }, /* Kyocera Mita FS 820, by zut <kernel@zut.de> */ { 0x04f9, 0x000d, USBLP_QUIRK_BIDIR }, /* Brother Industries, Ltd HL-1440 Laser Printer */ { 0x04b8, 0x0202, USBLP_QUIRK_BAD_CLASS }, /* Seiko Epson Receipt Printer M129C */ { 0, 0 } }; static int usblp_wwait(struct usblp *usblp, int nonblock); static int usblp_wtest(struct usblp *usblp, int nonblock); static int usblp_rwait_and_lock(struct usblp *usblp, int nonblock); static int usblp_rtest(struct usblp *usblp, int nonblock); static int usblp_submit_read(struct usblp *usblp); static int usblp_select_alts(struct usblp *usblp); static int usblp_set_protocol(struct usblp *usblp, int protocol); static int usblp_cache_device_id_string(struct usblp *usblp); /* forward reference to make our lives easier */ static struct usb_driver usblp_driver; static DEFINE_MUTEX(usblp_mutex); /* locks the existence of usblp's */ /* * Functions for usblp control messages. */ static int usblp_ctrl_msg(struct usblp *usblp, int request, int type, int dir, int recip, int value, void *buf, int len) { int retval; int index = usblp->ifnum; /* High byte has the interface index. Low byte has the alternate setting. */ if ((request == USBLP_REQ_GET_ID) && (type == USB_TYPE_CLASS)) index = (usblp->ifnum<<8)|usblp->protocol[usblp->current_protocol].alt_setting; retval = usb_control_msg(usblp->dev, dir ? usb_rcvctrlpipe(usblp->dev, 0) : usb_sndctrlpipe(usblp->dev, 0), request, type | dir | recip, value, index, buf, len, USBLP_CTL_TIMEOUT); dev_dbg(&usblp->intf->dev, "usblp_control_msg: rq: 0x%02x dir: %d recip: %d value: %d idx: %d len: %#x result: %d\n", request, !!dir, recip, value, index, len, retval); return retval < 0 ? retval : 0; } #define usblp_read_status(usblp, status)\ usblp_ctrl_msg(usblp, USBLP_REQ_GET_STATUS, USB_TYPE_CLASS, USB_DIR_IN, USB_RECIP_INTERFACE, 0, status, 1) #define usblp_get_id(usblp, config, id, maxlen)\ usblp_ctrl_msg(usblp, USBLP_REQ_GET_ID, USB_TYPE_CLASS, USB_DIR_IN, USB_RECIP_INTERFACE, config, id, maxlen) #define usblp_reset(usblp)\ usblp_ctrl_msg(usblp, USBLP_REQ_RESET, USB_TYPE_CLASS, USB_DIR_OUT, USB_RECIP_OTHER, 0, NULL, 0) static int usblp_hp_channel_change_request(struct usblp *usblp, int channel, u8 *new_channel) { u8 *buf; int ret; buf = kzalloc(1, GFP_KERNEL); if (!buf) return -ENOMEM; ret = usblp_ctrl_msg(usblp, USBLP_REQ_HP_CHANNEL_CHANGE_REQUEST, USB_TYPE_VENDOR, USB_DIR_IN, USB_RECIP_INTERFACE, channel, buf, 1); if (ret == 0) *new_channel = buf[0]; kfree(buf); return ret; } /* * See the description for usblp_select_alts() below for the usage * explanation. Look into your /sys/kernel/debug/usb/devices and dmesg in * case of any trouble. */ static int proto_bias = -1; /* * URB callback. */ static void usblp_bulk_read(struct urb *urb) { struct usblp *usblp = urb->context; int status = urb->status; unsigned long flags; if (usblp->present && usblp->used) { if (status) printk(KERN_WARNING "usblp%d: " "nonzero read bulk status received: %d\n", usblp->minor, status); } spin_lock_irqsave(&usblp->lock, flags); if (status < 0) usblp->rstatus = status; else usblp->rstatus = urb->actual_length; usblp->rcomplete = 1; wake_up(&usblp->rwait); spin_unlock_irqrestore(&usblp->lock, flags); usb_free_urb(urb); } static void usblp_bulk_write(struct urb *urb) { struct usblp *usblp = urb->context; int status = urb->status; unsigned long flags; if (usblp->present && usblp->used) { if (status) printk(KERN_WARNING "usblp%d: " "nonzero write bulk status received: %d\n", usblp->minor, status); } spin_lock_irqsave(&usblp->lock, flags); if (status < 0) usblp->wstatus = status; else usblp->wstatus = urb->actual_length; usblp->no_paper = 0; usblp->wcomplete = 1; wake_up(&usblp->wwait); spin_unlock_irqrestore(&usblp->lock, flags); usb_free_urb(urb); } /* * Get and print printer errors. */ static const char *usblp_messages[] = { "ok", "out of paper", "off-line", "on fire" }; static int usblp_check_status(struct usblp *usblp, int err) { unsigned char status, newerr = 0; int error; mutex_lock(&usblp->mut); if ((error = usblp_read_status(usblp, usblp->statusbuf)) < 0) { mutex_unlock(&usblp->mut); printk_ratelimited(KERN_ERR "usblp%d: error %d reading printer status\n", usblp->minor, error); return 0; } status = *usblp->statusbuf; mutex_unlock(&usblp->mut); if (~status & LP_PERRORP) newerr = 3; if (status & LP_POUTPA) newerr = 1; if (~status & LP_PSELECD) newerr = 2; if (newerr != err) { printk(KERN_INFO "usblp%d: %s\n", usblp->minor, usblp_messages[newerr]); } return newerr; } static int handle_bidir(struct usblp *usblp) { if (usblp->bidir && usblp->used) { if (usblp_submit_read(usblp) < 0) return -EIO; } return 0; } /* * File op functions. */ static int usblp_open(struct inode *inode, struct file *file) { int minor = iminor(inode); struct usblp *usblp; struct usb_interface *intf; int retval; if (minor < 0) return -ENODEV; mutex_lock(&usblp_mutex); retval = -ENODEV; intf = usb_find_interface(&usblp_driver, minor); if (!intf) goto out; usblp = usb_get_intfdata(intf); if (!usblp || !usblp->dev || !usblp->present) goto out; retval = -EBUSY; if (usblp->used) goto out; /* * We do not implement LP_ABORTOPEN/LPABORTOPEN for two reasons: * - We do not want persistent state which close(2) does not clear * - It is not used anyway, according to CUPS people */ retval = usb_autopm_get_interface(intf); if (retval < 0) goto out; usblp->used = 1; file->private_data = usblp; usblp->wcomplete = 1; /* we begin writeable */ usblp->wstatus = 0; usblp->rcomplete = 0; if (handle_bidir(usblp) < 0) { usb_autopm_put_interface(intf); usblp->used = 0; file->private_data = NULL; retval = -EIO; } out: mutex_unlock(&usblp_mutex); return retval; } static void usblp_cleanup(struct usblp *usblp) { printk(KERN_INFO "usblp%d: removed\n", usblp->minor); kfree(usblp->readbuf); kfree(usblp->device_id_string); kfree(usblp->statusbuf); usb_put_intf(usblp->intf); kfree(usblp); } static void usblp_unlink_urbs(struct usblp *usblp) { usb_kill_anchored_urbs(&usblp->urbs); } static int usblp_release(struct inode *inode, struct file *file) { struct usblp *usblp = file->private_data; usblp->flags &= ~LP_ABORT; mutex_lock(&usblp_mutex); usblp->used = 0; if (usblp->present) usblp_unlink_urbs(usblp); usb_autopm_put_interface(usblp->intf); if (!usblp->present) /* finish cleanup from disconnect */ usblp_cleanup(usblp); /* any URBs must be dead */ mutex_unlock(&usblp_mutex); return 0; } /* No kernel lock - fine */ static __poll_t usblp_poll(struct file *file, struct poll_table_struct *wait) { struct usblp *usblp = file->private_data; __poll_t ret = 0; unsigned long flags; /* Should we check file->f_mode & FMODE_WRITE before poll_wait()? */ poll_wait(file, &usblp->rwait, wait); poll_wait(file, &usblp->wwait, wait); mutex_lock(&usblp->mut); if (!usblp->present) ret |= EPOLLHUP; mutex_unlock(&usblp->mut); spin_lock_irqsave(&usblp->lock, flags); if (usblp->bidir && usblp->rcomplete) ret |= EPOLLIN | EPOLLRDNORM; if (usblp->no_paper || usblp->wcomplete) ret |= EPOLLOUT | EPOLLWRNORM; spin_unlock_irqrestore(&usblp->lock, flags); return ret; } static long usblp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct usblp *usblp = file->private_data; int length, err, i; unsigned char newChannel; int status; int twoints[2]; int retval = 0; mutex_lock(&usblp->mut); if (!usblp->present) { retval = -ENODEV; goto done; } dev_dbg(&usblp->intf->dev, "usblp_ioctl: cmd=0x%x (%c nr=%d len=%d dir=%d)\n", cmd, _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd), _IOC_DIR(cmd)); if (_IOC_TYPE(cmd) == 'P') /* new-style ioctl number */ switch (_IOC_NR(cmd)) { case IOCNR_GET_DEVICE_ID: /* get the DEVICE_ID string */ if (_IOC_DIR(cmd) != _IOC_READ) { retval = -EINVAL; goto done; } length = usblp_cache_device_id_string(usblp); if (length < 0) { retval = length; goto done; } if (length > _IOC_SIZE(cmd)) length = _IOC_SIZE(cmd); /* truncate */ if (copy_to_user((void __user *) arg, usblp->device_id_string, (unsigned long) length)) { retval = -EFAULT; goto done; } break; case IOCNR_GET_PROTOCOLS: if (_IOC_DIR(cmd) != _IOC_READ || _IOC_SIZE(cmd) < sizeof(twoints)) { retval = -EINVAL; goto done; } twoints[0] = usblp->current_protocol; twoints[1] = 0; for (i = USBLP_FIRST_PROTOCOL; i <= USBLP_LAST_PROTOCOL; i++) { if (usblp->protocol[i].alt_setting >= 0) twoints[1] |= (1<<i); } if (copy_to_user((void __user *)arg, (unsigned char *)twoints, sizeof(twoints))) { retval = -EFAULT; goto done; } break; case IOCNR_SET_PROTOCOL: if (_IOC_DIR(cmd) != _IOC_WRITE) { retval = -EINVAL; goto done; } #ifdef DEBUG if (arg == -10) { usblp_dump(usblp); break; } #endif usblp_unlink_urbs(usblp); retval = usblp_set_protocol(usblp, arg); if (retval < 0) { usblp_set_protocol(usblp, usblp->current_protocol); } break; case IOCNR_HP_SET_CHANNEL: if (_IOC_DIR(cmd) != _IOC_WRITE || le16_to_cpu(usblp->dev->descriptor.idVendor) != 0x03F0 || usblp->quirks & USBLP_QUIRK_BIDIR) { retval = -EINVAL; goto done; } err = usblp_hp_channel_change_request(usblp, arg, &newChannel); if (err < 0) { dev_err(&usblp->dev->dev, "usblp%d: error = %d setting " "HP channel\n", usblp->minor, err); retval = -EIO; goto done; } dev_dbg(&usblp->intf->dev, "usblp%d requested/got HP channel %ld/%d\n", usblp->minor, arg, newChannel); break; case IOCNR_GET_BUS_ADDRESS: if (_IOC_DIR(cmd) != _IOC_READ || _IOC_SIZE(cmd) < sizeof(twoints)) { retval = -EINVAL; goto done; } twoints[0] = usblp->dev->bus->busnum; twoints[1] = usblp->dev->devnum; if (copy_to_user((void __user *)arg, (unsigned char *)twoints, sizeof(twoints))) { retval = -EFAULT; goto done; } dev_dbg(&usblp->intf->dev, "usblp%d is bus=%d, device=%d\n", usblp->minor, twoints[0], twoints[1]); break; case IOCNR_GET_VID_PID: if (_IOC_DIR(cmd) != _IOC_READ || _IOC_SIZE(cmd) < sizeof(twoints)) { retval = -EINVAL; goto done; } twoints[0] = le16_to_cpu(usblp->dev->descriptor.idVendor); twoints[1] = le16_to_cpu(usblp->dev->descriptor.idProduct); if (copy_to_user((void __user *)arg, (unsigned char *)twoints, sizeof(twoints))) { retval = -EFAULT; goto done; } dev_dbg(&usblp->intf->dev, "usblp%d is VID=0x%4.4X, PID=0x%4.4X\n", usblp->minor, twoints[0], twoints[1]); break; case IOCNR_SOFT_RESET: if (_IOC_DIR(cmd) != _IOC_NONE) { retval = -EINVAL; goto done; } retval = usblp_reset(usblp); break; default: retval = -ENOTTY; } else /* old-style ioctl value */ switch (cmd) { case LPGETSTATUS: retval = usblp_read_status(usblp, usblp->statusbuf); if (retval) { printk_ratelimited(KERN_ERR "usblp%d:" "failed reading printer status (%d)\n", usblp->minor, retval); retval = -EIO; goto done; } status = *usblp->statusbuf; if (copy_to_user((void __user *)arg, &status, sizeof(int))) retval = -EFAULT; break; case LPABORT: if (arg) usblp->flags |= LP_ABORT; else usblp->flags &= ~LP_ABORT; break; default: retval = -ENOTTY; } done: mutex_unlock(&usblp->mut); return retval; } static struct urb *usblp_new_writeurb(struct usblp *usblp, int transfer_length) { struct urb *urb; char *writebuf; writebuf = kmalloc(transfer_length, GFP_KERNEL); if (writebuf == NULL) return NULL; urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) { kfree(writebuf); return NULL; } usb_fill_bulk_urb(urb, usblp->dev, usb_sndbulkpipe(usblp->dev, usblp->protocol[usblp->current_protocol].epwrite->bEndpointAddress), writebuf, transfer_length, usblp_bulk_write, usblp); urb->transfer_flags |= URB_FREE_BUFFER; return urb; } static ssize_t usblp_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct usblp *usblp = file->private_data; struct urb *writeurb; int rv; int transfer_length; ssize_t writecount = 0; if (mutex_lock_interruptible(&usblp->wmut)) { rv = -EINTR; goto raise_biglock; } if ((rv = usblp_wwait(usblp, !!(file->f_flags & O_NONBLOCK))) < 0) goto raise_wait; while (writecount < count) { /* * Step 1: Submit next block. */ if ((transfer_length = count - writecount) > USBLP_BUF_SIZE) transfer_length = USBLP_BUF_SIZE; rv = -ENOMEM; writeurb = usblp_new_writeurb(usblp, transfer_length); if (writeurb == NULL) goto raise_urb; usb_anchor_urb(writeurb, &usblp->urbs); if (copy_from_user(writeurb->transfer_buffer, buffer + writecount, transfer_length)) { rv = -EFAULT; goto raise_badaddr; } spin_lock_irq(&usblp->lock); usblp->wcomplete = 0; spin_unlock_irq(&usblp->lock); if ((rv = usb_submit_urb(writeurb, GFP_KERNEL)) < 0) { usblp->wstatus = 0; spin_lock_irq(&usblp->lock); usblp->no_paper = 0; usblp->wcomplete = 1; wake_up(&usblp->wwait); spin_unlock_irq(&usblp->lock); if (rv != -ENOMEM) rv = -EIO; goto raise_submit; } /* * Step 2: Wait for transfer to end, collect results. */ rv = usblp_wwait(usblp, !!(file->f_flags&O_NONBLOCK)); if (rv < 0) { if (rv == -EAGAIN) { /* Presume that it's going to complete well. */ writecount += transfer_length; } if (rv == -ENOSPC) { spin_lock_irq(&usblp->lock); usblp->no_paper = 1; /* Mark for poll(2) */ spin_unlock_irq(&usblp->lock); writecount += transfer_length; } /* Leave URB dangling, to be cleaned on close. */ goto collect_error; } if (usblp->wstatus < 0) { rv = -EIO; goto collect_error; } /* * This is critical: it must be our URB, not other writer's. * The wmut exists mainly to cover us here. */ writecount += usblp->wstatus; } mutex_unlock(&usblp->wmut); return writecount; raise_submit: raise_badaddr: usb_unanchor_urb(writeurb); usb_free_urb(writeurb); raise_urb: raise_wait: collect_error: /* Out of raise sequence */ mutex_unlock(&usblp->wmut); raise_biglock: return writecount ? writecount : rv; } /* * Notice that we fail to restart in a few cases: on EFAULT, on restart * error, etc. This is the historical behaviour. In all such cases we return * EIO, and applications loop in order to get the new read going. */ static ssize_t usblp_read(struct file *file, char __user *buffer, size_t len, loff_t *ppos) { struct usblp *usblp = file->private_data; ssize_t count; ssize_t avail; int rv; if (!usblp->bidir) return -EINVAL; rv = usblp_rwait_and_lock(usblp, !!(file->f_flags & O_NONBLOCK)); if (rv < 0) return rv; if (!usblp->present) { count = -ENODEV; goto done; } if ((avail = usblp->rstatus) < 0) { printk(KERN_ERR "usblp%d: error %d reading from printer\n", usblp->minor, (int)avail); usblp_submit_read(usblp); count = -EIO; goto done; } count = len < avail - usblp->readcount ? len : avail - usblp->readcount; if (count != 0 && copy_to_user(buffer, usblp->readbuf + usblp->readcount, count)) { count = -EFAULT; goto done; } if ((usblp->readcount += count) == avail) { if (usblp_submit_read(usblp) < 0) { /* We don't want to leak USB return codes into errno. */ if (count == 0) count = -EIO; goto done; } } done: mutex_unlock(&usblp->mut); return count; } /* * Wait for the write path to come idle. * This is called under the ->wmut, so the idle path stays idle. * * Our write path has a peculiar property: it does not buffer like a tty, * but waits for the write to succeed. This allows our ->release to bug out * without waiting for writes to drain. But it obviously does not work * when O_NONBLOCK is set. So, applications setting O_NONBLOCK must use * select(2) or poll(2) to wait for the buffer to drain before closing. * Alternatively, set blocking mode with fcntl and issue a zero-size write. */ static int usblp_wwait(struct usblp *usblp, int nonblock) { DECLARE_WAITQUEUE(waita, current); int rc; int err = 0; add_wait_queue(&usblp->wwait, &waita); for (;;) { if (mutex_lock_interruptible(&usblp->mut)) { rc = -EINTR; break; } set_current_state(TASK_INTERRUPTIBLE); rc = usblp_wtest(usblp, nonblock); mutex_unlock(&usblp->mut); if (rc <= 0) break; if (schedule_timeout(msecs_to_jiffies(1500)) == 0) { if (usblp->flags & LP_ABORT) { err = usblp_check_status(usblp, err); if (err == 1) { /* Paper out */ rc = -ENOSPC; break; } } else { /* Prod the printer, Gentoo#251237. */ mutex_lock(&usblp->mut); usblp_read_status(usblp, usblp->statusbuf); mutex_unlock(&usblp->mut); } } } set_current_state(TASK_RUNNING); remove_wait_queue(&usblp->wwait, &waita); return rc; } static int usblp_wtest(struct usblp *usblp, int nonblock) { unsigned long flags; if (!usblp->present) return -ENODEV; if (signal_pending(current)) return -EINTR; spin_lock_irqsave(&usblp->lock, flags); if (usblp->wcomplete) { spin_unlock_irqrestore(&usblp->lock, flags); return 0; } spin_unlock_irqrestore(&usblp->lock, flags); if (nonblock) return -EAGAIN; return 1; } /* * Wait for read bytes to become available. This probably should have been * called usblp_r_lock_and_wait(), because we lock first. But it's a traditional * name for functions which lock and return. * * We do not use wait_event_interruptible because it makes locking iffy. */ static int usblp_rwait_and_lock(struct usblp *usblp, int nonblock) { DECLARE_WAITQUEUE(waita, current); int rc; add_wait_queue(&usblp->rwait, &waita); for (;;) { if (mutex_lock_interruptible(&usblp->mut)) { rc = -EINTR; break; } set_current_state(TASK_INTERRUPTIBLE); if ((rc = usblp_rtest(usblp, nonblock)) < 0) { mutex_unlock(&usblp->mut); break; } if (rc == 0) /* Keep it locked */ break; mutex_unlock(&usblp->mut); schedule(); } set_current_state(TASK_RUNNING); remove_wait_queue(&usblp->rwait, &waita); return rc; } static int usblp_rtest(struct usblp *usblp, int nonblock) { unsigned long flags; if (!usblp->present) return -ENODEV; if (signal_pending(current)) return -EINTR; spin_lock_irqsave(&usblp->lock, flags); if (usblp->rcomplete) { spin_unlock_irqrestore(&usblp->lock, flags); return 0; } spin_unlock_irqrestore(&usblp->lock, flags); if (nonblock) return -EAGAIN; return 1; } /* * Please check ->bidir and other such things outside for now. */ static int usblp_submit_read(struct usblp *usblp) { struct urb *urb; unsigned long flags; int rc; rc = -ENOMEM; urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) goto raise_urb; usb_fill_bulk_urb(urb, usblp->dev, usb_rcvbulkpipe(usblp->dev, usblp->protocol[usblp->current_protocol].epread->bEndpointAddress), usblp->readbuf, USBLP_BUF_SIZE_IN, usblp_bulk_read, usblp); usb_anchor_urb(urb, &usblp->urbs); spin_lock_irqsave(&usblp->lock, flags); usblp->readcount = 0; /* XXX Why here? */ usblp->rcomplete = 0; spin_unlock_irqrestore(&usblp->lock, flags); if ((rc = usb_submit_urb(urb, GFP_KERNEL)) < 0) { dev_dbg(&usblp->intf->dev, "error submitting urb (%d)\n", rc); spin_lock_irqsave(&usblp->lock, flags); usblp->rstatus = rc; usblp->rcomplete = 1; spin_unlock_irqrestore(&usblp->lock, flags); goto raise_submit; } return 0; raise_submit: usb_unanchor_urb(urb); usb_free_urb(urb); raise_urb: return rc; } /* * Checks for printers that have quirks, such as requiring unidirectional * communication but reporting bidirectional; currently some HP printers * have this flaw (HP 810, 880, 895, etc.), or needing an init string * sent at each open (like some Epsons). * Returns 1 if found, 0 if not found. * * HP recommended that we use the bidirectional interface but * don't attempt any bulk IN transfers from the IN endpoint. * Here's some more detail on the problem: * The problem is not that it isn't bidirectional though. The problem * is that if you request a device ID, or status information, while * the buffers are full, the return data will end up in the print data * buffer. For example if you make sure you never request the device ID * while you are sending print data, and you don't try to query the * printer status every couple of milliseconds, you will probably be OK. */ static unsigned int usblp_quirks(__u16 vendor, __u16 product) { int i; for (i = 0; quirk_printers[i].vendorId; i++) { if (vendor == quirk_printers[i].vendorId && product == quirk_printers[i].productId) return quirk_printers[i].quirks; } return 0; } static const struct file_operations usblp_fops = { .owner = THIS_MODULE, .read = usblp_read, .write = usblp_write, .poll = usblp_poll, .unlocked_ioctl = usblp_ioctl, .compat_ioctl = usblp_ioctl, .open = usblp_open, .release = usblp_release, .llseek = noop_llseek, }; static char *usblp_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } static struct usb_class_driver usblp_class = { .name = "lp%d", .devnode = usblp_devnode, .fops = &usblp_fops, .minor_base = USBLP_MINOR_BASE, }; static ssize_t ieee1284_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usblp *usblp = usb_get_intfdata(intf); if (usblp->device_id_string[0] == 0 && usblp->device_id_string[1] == 0) return 0; return sprintf(buf, "%s", usblp->device_id_string+2); } static DEVICE_ATTR_RO(ieee1284_id); static struct attribute *usblp_attrs[] = { &dev_attr_ieee1284_id.attr, NULL, }; ATTRIBUTE_GROUPS(usblp); static int usblp_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(intf); struct usblp *usblp; int protocol; int retval; /* Malloc and start initializing usblp structure so we can use it * directly. */ usblp = kzalloc(sizeof(struct usblp), GFP_KERNEL); if (!usblp) { retval = -ENOMEM; goto abort_ret; } usblp->dev = dev; mutex_init(&usblp->wmut); mutex_init(&usblp->mut); spin_lock_init(&usblp->lock); init_waitqueue_head(&usblp->rwait); init_waitqueue_head(&usblp->wwait); init_usb_anchor(&usblp->urbs); usblp->ifnum = intf->cur_altsetting->desc.bInterfaceNumber; usblp->intf = usb_get_intf(intf); /* Malloc device ID string buffer to the largest expected length, * since we can re-query it on an ioctl and a dynamic string * could change in length. */ if (!(usblp->device_id_string = kmalloc(USBLP_DEVICE_ID_SIZE, GFP_KERNEL))) { retval = -ENOMEM; goto abort; } /* * Allocate read buffer. We somewhat wastefully * malloc both regardless of bidirectionality, because the * alternate setting can be changed later via an ioctl. */ if (!(usblp->readbuf = kmalloc(USBLP_BUF_SIZE_IN, GFP_KERNEL))) { retval = -ENOMEM; goto abort; } /* Allocate buffer for printer status */ usblp->statusbuf = kmalloc(STATUS_BUF_SIZE, GFP_KERNEL); if (!usblp->statusbuf) { retval = -ENOMEM; goto abort; } /* Lookup quirks for this printer. */ usblp->quirks = usblp_quirks( le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); /* Analyze and pick initial alternate settings and endpoints. */ protocol = usblp_select_alts(usblp); if (protocol < 0) { dev_dbg(&intf->dev, "incompatible printer-class device 0x%4.4X/0x%4.4X\n", le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); retval = -ENODEV; goto abort; } /* Setup the selected alternate setting and endpoints. */ if (usblp_set_protocol(usblp, protocol) < 0) { retval = -ENODEV; /* ->probe isn't ->ioctl */ goto abort; } /* Retrieve and store the device ID string. */ usblp_cache_device_id_string(usblp); #ifdef DEBUG usblp_check_status(usblp, 0); #endif usb_set_intfdata(intf, usblp); usblp->present = 1; retval = usb_register_dev(intf, &usblp_class); if (retval) { dev_err(&intf->dev, "usblp: Not able to get a minor (base %u, slice default): %d\n", USBLP_MINOR_BASE, retval); goto abort_intfdata; } usblp->minor = intf->minor; dev_info(&intf->dev, "usblp%d: USB %sdirectional printer dev %d if %d alt %d proto %d vid 0x%4.4X pid 0x%4.4X\n", usblp->minor, usblp->bidir ? "Bi" : "Uni", dev->devnum, usblp->ifnum, usblp->protocol[usblp->current_protocol].alt_setting, usblp->current_protocol, le16_to_cpu(usblp->dev->descriptor.idVendor), le16_to_cpu(usblp->dev->descriptor.idProduct)); return 0; abort_intfdata: usb_set_intfdata(intf, NULL); abort: kfree(usblp->readbuf); kfree(usblp->statusbuf); kfree(usblp->device_id_string); usb_put_intf(usblp->intf); kfree(usblp); abort_ret: return retval; } /* * We are a "new" style driver with usb_device_id table, * but our requirements are too intricate for simple match to handle. * * The "proto_bias" option may be used to specify the preferred protocol * for all USB printers (1=USB_CLASS_PRINTER/1/1, 2=USB_CLASS_PRINTER/1/2, * 3=USB_CLASS_PRINTER/1/3). If the device supports the preferred protocol, * then we bind to it. * * The best interface for us is USB_CLASS_PRINTER/1/2, because it * is compatible with a stream of characters. If we find it, we bind to it. * * Note that the people from hpoj.sourceforge.net need to be able to * bind to USB_CLASS_PRINTER/1/3 (MLC/1284.4), so we provide them ioctls * for this purpose. * * Failing USB_CLASS_PRINTER/1/2, we look for USB_CLASS_PRINTER/1/3, * even though it's probably not stream-compatible, because this matches * the behaviour of the old code. * * If nothing else, we bind to USB_CLASS_PRINTER/1/1 * - the unidirectional interface. */ static int usblp_select_alts(struct usblp *usblp) { struct usb_interface *if_alt; struct usb_host_interface *ifd; struct usb_endpoint_descriptor *epwrite, *epread; int p, i; int res; if_alt = usblp->intf; for (p = 0; p < USBLP_MAX_PROTOCOLS; p++) usblp->protocol[p].alt_setting = -1; /* Find out what we have. */ for (i = 0; i < if_alt->num_altsetting; i++) { ifd = &if_alt->altsetting[i]; if (ifd->desc.bInterfaceClass != USB_CLASS_PRINTER || ifd->desc.bInterfaceSubClass != 1) if (!(usblp->quirks & USBLP_QUIRK_BAD_CLASS)) continue; if (ifd->desc.bInterfaceProtocol < USBLP_FIRST_PROTOCOL || ifd->desc.bInterfaceProtocol > USBLP_LAST_PROTOCOL) continue; /* Look for the expected bulk endpoints. */ if (ifd->desc.bInterfaceProtocol > 1) { res = usb_find_common_endpoints(ifd, &epread, &epwrite, NULL, NULL); } else { epread = NULL; res = usb_find_bulk_out_endpoint(ifd, &epwrite); } /* Ignore buggy hardware without the right endpoints. */ if (res) continue; /* Turn off reads for buggy bidirectional printers. */ if (usblp->quirks & USBLP_QUIRK_BIDIR) { printk(KERN_INFO "usblp%d: Disabling reads from " "problematic bidirectional printer\n", usblp->minor); epread = NULL; } usblp->protocol[ifd->desc.bInterfaceProtocol].alt_setting = ifd->desc.bAlternateSetting; usblp->protocol[ifd->desc.bInterfaceProtocol].epwrite = epwrite; usblp->protocol[ifd->desc.bInterfaceProtocol].epread = epread; } /* If our requested protocol is supported, then use it. */ if (proto_bias >= USBLP_FIRST_PROTOCOL && proto_bias <= USBLP_LAST_PROTOCOL && usblp->protocol[proto_bias].alt_setting != -1) return proto_bias; /* Ordering is important here. */ if (usblp->protocol[2].alt_setting != -1) return 2; if (usblp->protocol[1].alt_setting != -1) return 1; if (usblp->protocol[3].alt_setting != -1) return 3; /* If nothing is available, then don't bind to this device. */ return -1; } static int usblp_set_protocol(struct usblp *usblp, int protocol) { int r, alts; if (protocol < USBLP_FIRST_PROTOCOL || protocol > USBLP_LAST_PROTOCOL) return -EINVAL; /* Don't unnecessarily set the interface if there's a single alt. */ if (usblp->intf->num_altsetting > 1) { alts = usblp->protocol[protocol].alt_setting; if (alts < 0) return -EINVAL; r = usb_set_interface(usblp->dev, usblp->ifnum, alts); if (r < 0) { printk(KERN_ERR "usblp: can't set desired altsetting %d on interface %d\n", alts, usblp->ifnum); return r; } } usblp->bidir = (usblp->protocol[protocol].epread != NULL); usblp->current_protocol = protocol; dev_dbg(&usblp->intf->dev, "usblp%d set protocol %d\n", usblp->minor, protocol); return 0; } /* Retrieves and caches device ID string. * Returns length, including length bytes but not null terminator. * On error, returns a negative errno value. */ static int usblp_cache_device_id_string(struct usblp *usblp) { int err, length; err = usblp_get_id(usblp, 0, usblp->device_id_string, USBLP_DEVICE_ID_SIZE - 1); if (err < 0) { dev_dbg(&usblp->intf->dev, "usblp%d: error = %d reading IEEE-1284 Device ID string\n", usblp->minor, err); usblp->device_id_string[0] = usblp->device_id_string[1] = '\0'; return -EIO; } /* First two bytes are length in big-endian. * They count themselves, and we copy them into * the user's buffer. */ length = be16_to_cpu(*((__be16 *)usblp->device_id_string)); if (length < 2) length = 2; else if (length >= USBLP_DEVICE_ID_SIZE) length = USBLP_DEVICE_ID_SIZE - 1; usblp->device_id_string[length] = '\0'; dev_dbg(&usblp->intf->dev, "usblp%d Device ID string [len=%d]=\"%s\"\n", usblp->minor, length, &usblp->device_id_string[2]); return length; } static void usblp_disconnect(struct usb_interface *intf) { struct usblp *usblp = usb_get_intfdata(intf); usb_deregister_dev(intf, &usblp_class); if (!usblp || !usblp->dev) { dev_err(&intf->dev, "bogus disconnect\n"); BUG(); } mutex_lock(&usblp_mutex); mutex_lock(&usblp->mut); usblp->present = 0; wake_up(&usblp->wwait); wake_up(&usblp->rwait); usb_set_intfdata(intf, NULL); usblp_unlink_urbs(usblp); mutex_unlock(&usblp->mut); usb_poison_anchored_urbs(&usblp->urbs); if (!usblp->used) usblp_cleanup(usblp); mutex_unlock(&usblp_mutex); } static int usblp_suspend(struct usb_interface *intf, pm_message_t message) { struct usblp *usblp = usb_get_intfdata(intf); usblp_unlink_urbs(usblp); #if 0 /* XXX Do we want this? What if someone is reading, should we fail? */ /* not strictly necessary, but just in case */ wake_up(&usblp->wwait); wake_up(&usblp->rwait); #endif return 0; } static int usblp_resume(struct usb_interface *intf) { struct usblp *usblp = usb_get_intfdata(intf); int r; r = handle_bidir(usblp); return r; } static const struct usb_device_id usblp_ids[] = { { USB_DEVICE_INFO(USB_CLASS_PRINTER, 1, 1) }, { USB_DEVICE_INFO(USB_CLASS_PRINTER, 1, 2) }, { USB_DEVICE_INFO(USB_CLASS_PRINTER, 1, 3) }, { USB_INTERFACE_INFO(USB_CLASS_PRINTER, 1, 1) }, { USB_INTERFACE_INFO(USB_CLASS_PRINTER, 1, 2) }, { USB_INTERFACE_INFO(USB_CLASS_PRINTER, 1, 3) }, { USB_DEVICE(0x04b8, 0x0202) }, /* Seiko Epson Receipt Printer M129C */ { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usblp_ids); static struct usb_driver usblp_driver = { .name = "usblp", .probe = usblp_probe, .disconnect = usblp_disconnect, .suspend = usblp_suspend, .resume = usblp_resume, .id_table = usblp_ids, .dev_groups = usblp_groups, .supports_autosuspend = 1, }; module_usb_driver(usblp_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); module_param(proto_bias, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(proto_bias, "Favourite protocol number"); MODULE_LICENSE("GPL");
78 78 78 37 78 78 78 78 78 78 75 38 33 7 7 31 7 33 32 5 32 24 33 7 7 33 34 34 33 34 33 33 33 33 33 33 33 33 33 32 33 33 33 33 33 33 33 2 2 2 2 2 2 2 2 1 2 1 1 1 1 25 25 25 37 33 13 32 13 13 32 32 2 2792 78 7 5 37 2812 37 7 37 37 37 37 2817 2798 2819 2819 2819 7 7 2816 2816 2792 36 35 78 12 78 73 77 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King * Copyright (C) 2020 Christoph Hellwig */ #include <linux/fs.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/vmalloc.h> #include <linux/raid/detect.h> #include "check.h" static int (*const check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. */ #ifdef CONFIG_ACORN_PARTITION_ICS adfspart_check_ICS, #endif #ifdef CONFIG_ACORN_PARTITION_POWERTEC adfspart_check_POWERTEC, #endif #ifdef CONFIG_ACORN_PARTITION_EESOX adfspart_check_EESOX, #endif /* * Now move on to formats that only have partition info at * disk address 0xdc0. Since these may also have stale * PC/BIOS partition tables, they need to come before * the msdos entry. */ #ifdef CONFIG_ACORN_PARTITION_CUMANA adfspart_check_CUMANA, #endif #ifdef CONFIG_ACORN_PARTITION_ADFS adfspart_check_ADFS, #endif #ifdef CONFIG_CMDLINE_PARTITION cmdline_partition, #endif #ifdef CONFIG_OF_PARTITION of_partition, /* cmdline have priority to OF */ #endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif #ifdef CONFIG_SGI_PARTITION sgi_partition, #endif #ifdef CONFIG_LDM_PARTITION ldm_partition, /* this must come before msdos */ #endif #ifdef CONFIG_MSDOS_PARTITION msdos_partition, #endif #ifdef CONFIG_OSF_PARTITION osf_partition, #endif #ifdef CONFIG_SUN_PARTITION sun_partition, #endif #ifdef CONFIG_AMIGA_PARTITION amiga_partition, #endif #ifdef CONFIG_ATARI_PARTITION atari_partition, #endif #ifdef CONFIG_MAC_PARTITION mac_partition, #endif #ifdef CONFIG_ULTRIX_PARTITION ultrix_partition, #endif #ifdef CONFIG_IBM_PARTITION ibm_partition, #endif #ifdef CONFIG_KARMA_PARTITION karma_partition, #endif #ifdef CONFIG_SYSV68_PARTITION sysv68_partition, #endif NULL }; static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); return NULL; } state->limit = nr; return state; } static void free_partitions(struct parsed_partitions *state) { vfree(state->parts); kfree(state); } static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; state = allocate_partitions(hd); if (!state) return NULL; state->pp_buf = (char *)__get_free_page(GFP_KERNEL); if (!state->pp_buf) { free_partitions(state); return NULL; } state->pp_buf[0] = '\0'; state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); i = res = err = 0; while (!res && check_part[i]) { memset(state->parts, 0, state->limit * sizeof(state->parts[0])); res = check_part[i++](state); if (res < 0) { /* * We have hit an I/O error which we don't report now. * But record it, and let the others do their job. */ err = res; res = 0; } } if (res > 0) { printk(KERN_INFO "%s", state->pp_buf); free_page((unsigned long)state->pp_buf); return state; } if (state->access_beyond_eod) err = -ENOSPC; /* * The partition is unrecognized. So report I/O errors if there were any */ if (err) res = err; if (res) { strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); printk(KERN_INFO "%s", state->pp_buf); } free_page((unsigned long)state->pp_buf); free_partitions(state); return ERR_PTR(res); } static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif NULL }; static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; static const struct attribute_group *part_attr_groups[] = { &part_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif NULL }; static void part_release(struct device *dev) { put_disk(dev_to_bdev(dev)->bd_disk); bdev_drop(dev_to_bdev(dev)); } static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct block_device *part = dev_to_bdev(dev); add_uevent_var(env, "PARTN=%u", bdev_partno(part)); if (part->bd_meta_info && part->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); if (part->bd_meta_info && part->bd_meta_info->uuid[0]) add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid); return 0; } const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, .uevent = part_uevent, }; void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); xa_erase(&part->bd_disk->part_tbl, bdev_partno(part)); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. */ static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; const char *dname; int err; lockdep_assert_held(&disk->open_mutex); if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* * Partitions are not supported on zoned block devices that are used as * such. */ if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); } if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); /* ensure we always have a reference to the whole disk */ get_device(disk_to_dev(disk)); err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); else dev_set_name(pdev, "%s%d", dname, partno); device_initialize(pdev); pdev->class = &block_class; pdev->type = &part_type; pdev->parent = ddev; /* in consecutive minor range? */ if (bdev_partno(bdev) < disk->minors) { devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev)); } else { err = blk_alloc_ext_minor(); if (err < 0) goto out_put; devt = MKDEV(BLOCK_EXT_MAJOR, err); } pdev->devt = devt; if (info) { err = -ENOMEM; bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); if (!bdev->bd_meta_info) goto out_put; } /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); if (err) goto out_put; err = -ENOMEM; bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(pdev, &dev_attr_whole_disk); if (err) goto out_del; } if (flags & ADDPART_FLAG_READONLY) bdev_set_flag(bdev, BD_READ_ONLY); /* everything is up and running, commence */ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); if (err) goto out_del; bdev_add(bdev, devt); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); } static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct block_device *part; bool overlap = false; unsigned long idx; rcu_read_lock(); xa_for_each_start(&disk->part_tbl, idx, part, 1) { if (bdev_partno(part) != skip_partno && start < part->bd_start_sect + bdev_nr_sectors(part) && start + length > part->bd_start_sect) { overlap = true; break; } } rcu_read_unlock(); return overlap; } int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part; int ret; mutex_lock(&disk->open_mutex); if (!disk_live(disk)) { ret = -ENXIO; goto out; } if (disk->flags & GENHD_FL_NO_PART) { ret = -EINVAL; goto out; } if (partition_overlaps(disk, start, length, -1)) { ret = -EBUSY; goto out; } part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); ret = PTR_ERR_OR_ZERO(part); out: mutex_unlock(&disk->open_mutex); return ret; } int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EBUSY; if (atomic_read(&part->bd_openers)) goto out_unlock; /* * We verified that @part->bd_openers is zero above and so * @part->bd_holder{_ops} can't be set. And since we hold * @disk->open_mutex the device can't be claimed by anyone. * * So no need to call @part->bd_holder_ops->mark_dead() here. * Just delete the partition and invalidate it. */ bdev_unhash(part); invalidate_bdev(part); drop_partition(part); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } static bool disk_unlock_native_capacity(struct gendisk *disk) { if (!disk->fops->unlock_native_capacity || test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); return false; } printk(KERN_CONT "enabling native capacity\n"); disk->fops->unlock_native_capacity(disk); return true; } static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; struct block_device *part; if (!size) return true; if (from >= get_capacity(disk)) { printk(KERN_WARNING "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); if (disk_unlock_native_capacity(disk)) return false; return true; } if (from + size > get_capacity(disk)) { printk(KERN_WARNING "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); if (disk_unlock_native_capacity(disk)) return false; /* * We can not ignore partitions of broken tables created by for * example camera firmware, but we limit them to the end of the * disk to avoid creating invalid block devices. */ size = get_capacity(disk) - from; } part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part)) { if (PTR_ERR(part) != -ENXIO) { printk(KERN_ERR " %s: p%d could not be added: %pe\n", disk->disk_name, p, part); } return true; } if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part->bd_dev); return true; } static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; int ret = -EAGAIN, p; if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { /* * I/O error reading the partition table. If we tried to read * beyond EOD, retry after unlocking the native capacity. */ if (PTR_ERR(state) == -ENOSPC) { printk(KERN_WARNING "%s: partition table beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) return -EAGAIN; } return -EIO; } /* * Partitions are not supported on host managed zoned block devices. */ if (bdev_is_zoned(disk->part0)) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; goto out_free_state; } /* * If we read beyond EOD, try unlocking native capacity even if the * partition table was successfully read as we could be missing some * partitions. */ if (state->access_beyond_eod) { printk(KERN_WARNING "%s: partition table partially beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) goto out_free_state; } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); for (p = 1; p < state->limit; p++) if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; out_free_state: free_partitions(state); return ret; } int bdev_disk_changed(struct gendisk *disk, bool invalidate) { struct block_device *part; unsigned long idx; int ret = 0; lockdep_assert_held(&disk->open_mutex); if (!disk_live(disk)) return -ENXIO; rescan: if (disk->open_partitions) return -EBUSY; sync_blockdev(disk->part0); invalidate_bdev(disk->part0); xa_for_each_start(&disk->part_tbl, idx, part, 1) { /* * Remove the block device from the inode hash, so that * it cannot be looked up any more even when openers * still hold references. */ bdev_unhash(part); /* * If @disk->open_partitions isn't elevated but there's * still an active holder of that block device things * are broken. */ WARN_ON_ONCE(atomic_read(&part->bd_openers)); invalidate_bdev(part); drop_partition(part); } clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that * support partitions (independ of actually having partitions created). * Doing that is rather inconsistent, but changing it broke legacy * udisks polling for legacy ide-cdrom devices. Use the crude check * below to get the sane behavior for most device while not breaking * userspace for this particular setup. */ if (invalidate) { if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } if (get_capacity(disk)) { ret = blk_add_partitions(disk); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); } return ret; } /* * Only exported for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { struct address_space *mapping = state->disk->part0->bd_mapping; struct folio *folio; if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; goto out; } folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL); if (IS_ERR(folio)) goto out; p->v = folio; return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE); out: p->v = NULL; return NULL; }
14 14 14 14 14 14 14 14 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 // SPDX-License-Identifier: GPL-2.0 /* * Neil Brown <neilb@cse.unsw.edu.au> * J. Bruce Fields <bfields@umich.edu> * Andy Adamson <andros@umich.edu> * Dug Song <dugsong@monkey.org> * * RPCSEC_GSS server authentication. * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 * (gssapi) * * The RPCSEC_GSS involves three stages: * 1/ context creation * 2/ data exchange * 3/ context destruction * * Context creation is handled largely by upcalls to user-space. * In particular, GSS_Accept_sec_context is handled by an upcall * Data exchange is handled entirely within the kernel * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. * Context destruction is handled in-kernel * GSS_Delete_sec_context is in-kernel * * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. * The context handle and gss_token are used as a key into the rpcsec_init cache. * The content of this cache includes some of the outputs of GSS_Accept_sec_context, * being major_status, minor_status, context_handle, reply_token. * These are sent back to the client. * Sequence window management is handled by the kernel. The window size if currently * a compile time constant. * * When user-space is happy that a context is established, it places an entry * in the rpcsec_context cache. The key for this cache is the context_handle. * The content includes: * uid/gidlist - for determining access rights * mechanism type * mechanism specific information, such as a key * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/user_namespace.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/gss_krb5.h> #include <trace/events/rpcgss.h> #include "gss_rpc_upcall.h" /* * Unfortunately there isn't a maximum checksum size exported via the * GSS API. Manufacture one based on GSS mechanisms supported by this * implementation. */ #define GSS_MAX_CKSUMSIZE (GSS_KRB5_TOK_HDR_LEN + GSS_KRB5_MAX_CKSUM_LEN) /* * This value may be increased in the future to accommodate other * usage of the scratch buffer. */ #define GSS_SCRATCH_SIZE GSS_MAX_CKSUMSIZE struct gss_svc_data { /* decoded gss client cred: */ struct rpc_gss_wire_cred clcred; u32 gsd_databody_offset; struct rsc *rsci; /* for temporary results */ __be32 gsd_seq_num; u8 gsd_scratch[GSS_SCRATCH_SIZE]; }; /* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests * into replies. * * Key is context handle (\x if empty) and gss_token. * Content is major_status minor_status (integers) context_handle, reply_token. * */ static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b) { return a->len == b->len && 0 == memcmp(a->data, b->data, a->len); } #define RSI_HASHBITS 6 #define RSI_HASHMAX (1<<RSI_HASHBITS) struct rsi { struct cache_head h; struct xdr_netobj in_handle, in_token; struct xdr_netobj out_handle, out_token; int major_status, minor_status; struct rcu_head rcu_head; }; static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old); static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item); static void rsi_free(struct rsi *rsii) { kfree(rsii->in_handle.data); kfree(rsii->in_token.data); kfree(rsii->out_handle.data); kfree(rsii->out_token.data); } static void rsi_free_rcu(struct rcu_head *head) { struct rsi *rsii = container_of(head, struct rsi, rcu_head); rsi_free(rsii); kfree(rsii); } static void rsi_put(struct kref *ref) { struct rsi *rsii = container_of(ref, struct rsi, h.ref); call_rcu(&rsii->rcu_head, rsi_free_rcu); } static inline int rsi_hash(struct rsi *item) { return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); } static int rsi_match(struct cache_head *a, struct cache_head *b) { struct rsi *item = container_of(a, struct rsi, h); struct rsi *tmp = container_of(b, struct rsi, h); return netobj_equal(&item->in_handle, &tmp->in_handle) && netobj_equal(&item->in_token, &tmp->in_token); } static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len) { dst->len = len; dst->data = (len ? kmemdup(src, len, GFP_KERNEL) : NULL); if (len && !dst->data) return -ENOMEM; return 0; } static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) { return dup_to_netobj(dst, src->data, src->len); } static void rsi_init(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); new->out_handle.data = NULL; new->out_handle.len = 0; new->out_token.data = NULL; new->out_token.len = 0; new->in_handle.len = item->in_handle.len; item->in_handle.len = 0; new->in_token.len = item->in_token.len; item->in_token.len = 0; new->in_handle.data = item->in_handle.data; item->in_handle.data = NULL; new->in_token.data = item->in_token.data; item->in_token.data = NULL; } static void update_rsi(struct cache_head *cnew, struct cache_head *citem) { struct rsi *new = container_of(cnew, struct rsi, h); struct rsi *item = container_of(citem, struct rsi, h); BUG_ON(new->out_handle.data || new->out_token.data); new->out_handle.len = item->out_handle.len; item->out_handle.len = 0; new->out_token.len = item->out_token.len; item->out_token.len = 0; new->out_handle.data = item->out_handle.data; item->out_handle.data = NULL; new->out_token.data = item->out_token.data; item->out_token.data = NULL; new->major_status = item->major_status; new->minor_status = item->minor_status; } static struct cache_head *rsi_alloc(void) { struct rsi *rsii = kmalloc(sizeof(*rsii), GFP_KERNEL); if (rsii) return &rsii->h; else return NULL; } static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void rsi_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { struct rsi *rsii = container_of(h, struct rsi, h); qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); (*bpp)[-1] = '\n'; WARN_ONCE(*blen < 0, "RPCSEC/GSS credential too large - please use gssproxy\n"); } static int rsi_parse(struct cache_detail *cd, char *mesg, int mlen) { /* context token expiry major minor context token */ char *buf = mesg; char *ep; int len; struct rsi rsii, *rsip = NULL; time64_t expiry; int status = -EINVAL; memset(&rsii, 0, sizeof(rsii)); /* handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_handle, buf, len)) goto out; /* token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.in_token, buf, len)) goto out; rsip = rsi_lookup(cd, &rsii); if (!rsip) goto out; rsii.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; /* major/minor */ len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.major_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; len = qword_get(&mesg, buf, mlen); if (len <= 0) goto out; rsii.minor_status = simple_strtoul(buf, &ep, 10); if (*ep) goto out; /* out_handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_handle, buf, len)) goto out; /* out_token */ len = qword_get(&mesg, buf, mlen); status = -EINVAL; if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsii.out_token, buf, len)) goto out; rsii.h.expiry_time = expiry; rsip = rsi_update(cd, &rsii, rsip); status = 0; out: rsi_free(&rsii); if (rsip) cache_put(&rsip->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsi_cache_template = { .owner = THIS_MODULE, .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", .cache_put = rsi_put, .cache_upcall = rsi_upcall, .cache_request = rsi_request, .cache_parse = rsi_parse, .match = rsi_match, .init = rsi_init, .update = update_rsi, .alloc = rsi_alloc, }; static struct rsi *rsi_lookup(struct cache_detail *cd, struct rsi *item) { struct cache_head *ch; int hash = rsi_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct rsi *old) { struct cache_head *ch; int hash = rsi_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsi, h); else return NULL; } /* * The rpcsec_context cache is used to store a context that is * used in data exchange. * The key is a context handle. The content is: * uid, gidlist, mechanism, service-set, mech-specific-data */ #define RSC_HASHBITS 10 #define RSC_HASHMAX (1<<RSC_HASHBITS) #define GSS_SEQ_WIN 128 struct gss_svc_seq_data { /* highest seq number seen so far: */ u32 sd_max; /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of * sd_win is nonzero iff sequence number i has been seen already: */ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; spinlock_t sd_lock; }; struct rsc { struct cache_head h; struct xdr_netobj handle; struct svc_cred cred; struct gss_svc_seq_data seqdata; struct gss_ctx *mechctx; struct rcu_head rcu_head; }; static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old); static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item); static void rsc_free(struct rsc *rsci) { kfree(rsci->handle.data); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); } static void rsc_free_rcu(struct rcu_head *head) { struct rsc *rsci = container_of(head, struct rsc, rcu_head); kfree(rsci->handle.data); kfree(rsci); } static void rsc_put(struct kref *ref) { struct rsc *rsci = container_of(ref, struct rsc, h.ref); if (rsci->mechctx) gss_delete_sec_context(&rsci->mechctx); free_svc_cred(&rsci->cred); call_rcu(&rsci->rcu_head, rsc_free_rcu); } static inline int rsc_hash(struct rsc *rsci) { return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); } static int rsc_match(struct cache_head *a, struct cache_head *b) { struct rsc *new = container_of(a, struct rsc, h); struct rsc *tmp = container_of(b, struct rsc, h); return netobj_equal(&new->handle, &tmp->handle); } static void rsc_init(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->handle.len = tmp->handle.len; tmp->handle.len = 0; new->handle.data = tmp->handle.data; tmp->handle.data = NULL; new->mechctx = NULL; init_svc_cred(&new->cred); } static void update_rsc(struct cache_head *cnew, struct cache_head *ctmp) { struct rsc *new = container_of(cnew, struct rsc, h); struct rsc *tmp = container_of(ctmp, struct rsc, h); new->mechctx = tmp->mechctx; tmp->mechctx = NULL; memset(&new->seqdata, 0, sizeof(new->seqdata)); spin_lock_init(&new->seqdata.sd_lock); new->cred = tmp->cred; init_svc_cred(&tmp->cred); } static struct cache_head * rsc_alloc(void) { struct rsc *rsci = kmalloc(sizeof(*rsci), GFP_KERNEL); if (rsci) return &rsci->h; else return NULL; } static int rsc_upcall(struct cache_detail *cd, struct cache_head *h) { return -EINVAL; } static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) { /* contexthandle expiry [ uid gid N <n gids> mechname ...mechdata... ] */ char *buf = mesg; int id; int len, rv; struct rsc rsci, *rscp = NULL; time64_t expiry; int status = -EINVAL; struct gss_api_mech *gm = NULL; memset(&rsci, 0, sizeof(rsci)); /* context handle */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = -ENOMEM; if (dup_to_netobj(&rsci.handle, buf, len)) goto out; rsci.h.flags = 0; /* expiry */ status = get_expiry(&mesg, &expiry); if (status) goto out; status = -EINVAL; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* uid, or NEGATIVE */ rv = get_int(&mesg, &id); if (rv == -EINVAL) goto out; if (rv == -ENOENT) set_bit(CACHE_NEGATIVE, &rsci.h.flags); else { int N, i; /* * NOTE: we skip uid_valid()/gid_valid() checks here: * instead, * -1 id's are later mapped to the * (export-specific) anonymous id by nfsd_setuser. * * (But supplementary gid's get no such special * treatment so are checked for validity here.) */ /* uid */ rsci.cred.cr_uid = make_kuid(current_user_ns(), id); /* gid */ if (get_int(&mesg, &id)) goto out; rsci.cred.cr_gid = make_kgid(current_user_ns(), id); /* number of additional gid's */ if (get_int(&mesg, &N)) goto out; if (N < 0 || N > NGROUPS_MAX) goto out; status = -ENOMEM; rsci.cred.cr_group_info = groups_alloc(N); if (rsci.cred.cr_group_info == NULL) goto out; /* gid's */ status = -EINVAL; for (i=0; i<N; i++) { kgid_t kgid; if (get_int(&mesg, &id)) goto out; kgid = make_kgid(current_user_ns(), id); if (!gid_valid(kgid)) goto out; rsci.cred.cr_group_info->gid[i] = kgid; } groups_sort(rsci.cred.cr_group_info); /* mech name */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; gm = rsci.cred.cr_gss_mech = gss_mech_get_by_name(buf); status = -EOPNOTSUPP; if (!gm) goto out; status = -EINVAL; /* mech-specific data: */ len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; status = gss_import_sec_context(buf, len, gm, &rsci.mechctx, NULL, GFP_KERNEL); if (status) goto out; /* get client name */ len = qword_get(&mesg, buf, mlen); if (len > 0) { rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL); if (!rsci.cred.cr_principal) { status = -ENOMEM; goto out; } } } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static const struct cache_detail rsc_cache_template = { .owner = THIS_MODULE, .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", .cache_put = rsc_put, .cache_upcall = rsc_upcall, .cache_parse = rsc_parse, .match = rsc_match, .init = rsc_init, .update = update_rsc, .alloc = rsc_alloc, }; static struct rsc *rsc_lookup(struct cache_detail *cd, struct rsc *item) { struct cache_head *ch; int hash = rsc_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old) { struct cache_head *ch; int hash = rsc_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct rsc, h); else return NULL; } static struct rsc * gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) { struct rsc rsci; struct rsc *found; memset(&rsci, 0, sizeof(rsci)); if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) return NULL; found = rsc_lookup(cd, &rsci); rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) return NULL; return found; } /** * gss_check_seq_num - GSS sequence number window check * @rqstp: RPC Call to use when reporting errors * @rsci: cached GSS context state (updated on return) * @seq_num: sequence number to check * * Implements sequence number algorithm as specified in * RFC 2203, Section 5.3.3.1. "Context Management". * * Return values: * %true: @rqstp's GSS sequence number is inside the window * %false: @rqstp's GSS sequence number is outside the window */ static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, u32 seq_num) { struct gss_svc_seq_data *sd = &rsci->seqdata; bool result = false; spin_lock(&sd->sd_lock); if (seq_num > sd->sd_max) { if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { memset(sd->sd_win, 0, sizeof(sd->sd_win)); sd->sd_max = seq_num; } else while (sd->sd_max < seq_num) { sd->sd_max++; __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win); } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { goto toolow; } if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) goto alreadyseen; ok: result = true; out: spin_unlock(&sd->sd_lock); return result; toolow: trace_rpcgss_svc_seqno_low(rqstp, seq_num, sd->sd_max - GSS_SEQ_WIN, sd->sd_max); goto out; alreadyseen: trace_rpcgss_svc_seqno_seen(rqstp, seq_num); goto out; } /* * Decode and verify a Call's verifier field. For RPC_AUTH_GSS Calls, * the body of this field contains a variable length checksum. * * GSS-specific auth_stat values are mandated by RFC 2203 Section * 5.3.3.3. */ static int svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct gss_ctx *ctx_id = rsci->mechctx; u32 flavor, maj_stat; struct xdr_buf rpchdr; struct xdr_netobj checksum; struct kvec iov; /* * Compute the checksum of the incoming Call from the * XID field to credential field: */ iov.iov_base = rpcstart; iov.iov_len = (u8 *)xdr->p - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, (void **)&checksum.data, &checksum.len) < 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (flavor != RPC_AUTH_GSS) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (rqstp->rq_deferred) return SVC_OK; maj_stat = gss_verify_mic(ctx_id, &rpchdr, &checksum); if (maj_stat != GSS_S_COMPLETE) { trace_rpcgss_svc_mic(rqstp, maj_stat); rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) return SVC_DROP; return SVC_OK; } /* * Construct and encode a Reply's verifier field. The verifier's body * field contains a variable-length checksum of the GSS sequence * number. */ static bool svcauth_gss_encode_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) { struct gss_svc_data *gsd = rqstp->rq_auth_data; u32 maj_stat; struct xdr_buf verf_data; struct xdr_netobj checksum; struct kvec iov; gsd->gsd_seq_num = cpu_to_be32(seq); iov.iov_base = &gsd->gsd_seq_num; iov.iov_len = XDR_UNIT; xdr_buf_from_iov(&iov, &verf_data); checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(ctx_id, &verf_data, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; return xdr_stream_encode_opaque_auth(&rqstp->rq_res_stream, RPC_AUTH_GSS, checksum.data, checksum.len) > 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return false; } struct gss_domain { struct auth_domain h; u32 pseudoflavor; }; static struct auth_domain * find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) { char *name; name = gss_service_to_auth_domain_name(ctx->mech_type, svc); if (!name) return NULL; return auth_domain_find(name); } static struct auth_ops svcauthops_gss; u32 svcauth_gss_flavor(struct auth_domain *dom) { struct gss_domain *gd = container_of(dom, struct gss_domain, h); return gd->pseudoflavor; } EXPORT_SYMBOL_GPL(svcauth_gss_flavor); struct auth_domain * svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) { struct gss_domain *new; struct auth_domain *test; int stat = -ENOMEM; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out; kref_init(&new->h.ref); new->h.name = kstrdup(name, GFP_KERNEL); if (!new->h.name) goto out_free_dom; new->h.flavour = &svcauthops_gss; new->pseudoflavor = pseudoflavor; test = auth_domain_lookup(name, &new->h); if (test != &new->h) { pr_warn("svc: duplicate registration of gss pseudo flavour %s.\n", name); stat = -EADDRINUSE; auth_domain_put(test); goto out_free_name; } return test; out_free_name: kfree(new->h.name); out_free_dom: kfree(new); out: return ERR_PTR(stat); } EXPORT_SYMBOL_GPL(svcauth_gss_register_pseudoflavor); /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_integ(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, offset, seq_num, maj_stat; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (len & 3) goto unwrap_failed; offset = xdr_stream_pos(xdr); if (xdr_buf_subsegment(buf, &databody_integ, offset, len)) goto unwrap_failed; /* * The xdr_stream now points to the @seq_num field. The next * XDR data item is the @arg field, which contains the clear * text RPC program payload. The checksum, which follows the * @arg field, is located and decoded without updating the * xdr_stream. */ offset += len; if (xdr_decode_word(buf, offset, &checksum.len)) goto unwrap_failed; if (checksum.len > sizeof(gsd->gsd_scratch)) goto unwrap_failed; checksum.data = gsd->gsd_scratch; if (read_bytes_from_xdr_buf(buf, offset + XDR_UNIT, checksum.data, checksum.len)) goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; /* The received seqno is protected by the checksum. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; xdr_truncate_decode(xdr, XDR_UNIT + checksum.len); return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_mic: trace_rpcgss_svc_mic(rqstp, maj_stat); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; */ static noinline_for_stack int svcauth_gss_unwrap_priv(struct svc_rqst *rqstp, u32 seq, struct gss_ctx *ctx) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 len, maj_stat, seq_num, offset; struct xdr_buf *buf = xdr->buf; unsigned int saved_len; if (xdr_stream_decode_u32(xdr, &len) < 0) goto unwrap_failed; if (rqstp->rq_deferred) { /* Already decrypted last time through! The sequence number * check at out_seq is unnecessary but harmless: */ goto out_seq; } if (len > xdr_stream_remaining(xdr)) goto unwrap_failed; offset = xdr_stream_pos(xdr); saved_len = buf->len; maj_stat = gss_unwrap(ctx, offset, offset + len, buf); if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; xdr->nwords -= XDR_QUADLEN(saved_len - buf->len); out_seq: /* gss_unwrap() decrypted the sequence number. */ if (xdr_stream_decode_u32(xdr, &seq_num) < 0) goto unwrap_failed; if (seq_num != seq) goto bad_seqno; return 0; unwrap_failed: trace_rpcgss_svc_unwrap_failed(rqstp); return -EINVAL; bad_seqno: trace_rpcgss_svc_seqno_bad(rqstp, seq, seq_num); return -EINVAL; bad_unwrap: trace_rpcgss_svc_unwrap(rqstp, maj_stat); return -EINVAL; } static enum svc_auth_status svcauth_gss_set_client(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; struct rsc *rsci = svcdata->rsci; struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; rqstp->rq_auth_stat = rpc_autherr_badcred; /* * A gss export can be specified either by: * export *(sec=krb5,rw) * or by * export gss/krb5(rw) * The latter is deprecated; but for backwards compatibility reasons * the nfsd code will still fall back on trying it if the former * doesn't work; so we try to make both available to nfsd, below. */ rqstp->rq_gssclient = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); if (rqstp->rq_gssclient == NULL) return SVC_DENIED; stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } static bool svcauth_gss_proc_init_verf(struct cache_detail *cd, struct svc_rqst *rqstp, struct xdr_netobj *out_handle, int *major_status, u32 seq_num) { struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rsc *rsci; bool rc; if (*major_status != GSS_S_COMPLETE) goto null_verifier; rsci = gss_svc_searchbyctx(cd, out_handle); if (rsci == NULL) { *major_status = GSS_S_NO_CONTEXT; goto null_verifier; } rc = svcauth_gss_encode_verf(rqstp, rsci->mechctx, seq_num); cache_put(&rsci->h, cd); return rc; null_verifier: return xdr_stream_encode_opaque_auth(xdr, RPC_AUTH_NULL, NULL, 0) > 0; } static void gss_free_in_token_pages(struct gssp_in_token *in_token) { int i; i = 0; while (in_token->pages[i]) put_page(in_token->pages[i++]); kfree(in_token->pages); in_token->pages = NULL; } static int gss_read_proxy_verf(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; unsigned int length, pgto_offs, pgfrom_offs; int pages, i, pgto, pgfrom; size_t to_offs, from_offs; u32 inlen; if (dup_netobj(in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &inlen) < 0) goto out_denied_free; if (inlen > xdr_stream_remaining(xdr)) goto out_denied_free; pages = DIV_ROUND_UP(inlen, PAGE_SIZE); in_token->pages = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); if (!in_token->pages) goto out_denied_free; in_token->page_base = 0; in_token->page_len = inlen; for (i = 0; i < pages; i++) { in_token->pages[i] = alloc_page(GFP_KERNEL); if (!in_token->pages[i]) { gss_free_in_token_pages(in_token); goto out_denied_free; } } length = min_t(unsigned int, inlen, (char *)xdr->end - (char *)xdr->p); memcpy(page_address(in_token->pages[0]), xdr->p, length); inlen -= length; to_offs = length; from_offs = rqstp->rq_arg.page_base; while (inlen) { pgto = to_offs >> PAGE_SHIFT; pgfrom = from_offs >> PAGE_SHIFT; pgto_offs = to_offs & ~PAGE_MASK; pgfrom_offs = from_offs & ~PAGE_MASK; length = min_t(unsigned int, inlen, min_t(unsigned int, PAGE_SIZE - pgto_offs, PAGE_SIZE - pgfrom_offs)); memcpy(page_address(in_token->pages[pgto]) + pgto_offs, page_address(rqstp->rq_arg.pages[pgfrom]) + pgfrom_offs, length); to_offs += length; from_offs += length; inlen -= length; } return 0; out_denied_free: kfree(in_handle->data); return SVC_DENIED; } /* * RFC 2203, Section 5.2.3.1. * * struct rpc_gss_init_res { * opaque handle<>; * unsigned int gss_major; * unsigned int gss_minor; * unsigned int seq_window; * opaque gss_token<>; * }; */ static bool svcxdr_encode_gss_init_res(struct xdr_stream *xdr, struct xdr_netobj *handle, struct xdr_netobj *gss_token, unsigned int major_status, unsigned int minor_status, u32 seq_num) { if (xdr_stream_encode_opaque(xdr, handle->data, handle->len) < 0) return false; if (xdr_stream_encode_u32(xdr, major_status) < 0) return false; if (xdr_stream_encode_u32(xdr, minor_status) < 0) return false; if (xdr_stream_encode_u32(xdr, seq_num) < 0) return false; if (xdr_stream_encode_opaque(xdr, gss_token->data, gss_token->len) < 0) return false; return true; } /* * Having read the cred already and found we're in the context * initiation case, read the verifier and initiate (or check the results * of) upcalls to userspace for help with context initiation. If * the upcall results are available, write the verifier and result. * Otherwise, drop the request pending an answer to the upcall. */ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct rsi *rsip, rsikey; __be32 *p; u32 len; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) return SVC_CLOSE; /* * RFC 2203 Section 5.2.2 * * struct rpc_gss_init_arg { * opaque gss_token<>; * }; */ if (xdr_stream_decode_u32(xdr, &len) < 0) { kfree(rsikey.in_handle.data); return SVC_DENIED; } p = xdr_inline_decode(xdr, len); if (!p) { kfree(rsikey.in_handle.data); return SVC_DENIED; } rsikey.in_token.data = kmalloc(len, GFP_KERNEL); if (ZERO_OR_NULL_PTR(rsikey.in_token.data)) { kfree(rsikey.in_handle.data); return SVC_CLOSE; } memcpy(rsikey.in_token.data, p, len); rsikey.in_token.len = len; /* Perform upcall, or find upcall result: */ rsip = rsi_lookup(sn->rsi_cache, &rsikey); rsi_free(&rsikey); if (!rsip) return SVC_CLOSE; if (cache_check(sn->rsi_cache, &rsip->h, &rqstp->rq_chandle) < 0) /* No upcall result: */ return SVC_CLOSE; ret = SVC_CLOSE; if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &rsip->out_handle, &rsip->major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &rsip->out_handle, &rsip->out_token, rsip->major_status, rsip->minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: cache_put(&rsip->h, sn->rsi_cache); return ret; } static int gss_proxy_save_rsc(struct cache_detail *cd, struct gssp_upcall_data *ud, uint64_t *handle) { struct rsc rsci, *rscp = NULL; static atomic64_t ctxhctr; long long ctxh; struct gss_api_mech *gm = NULL; time64_t expiry; int status; memset(&rsci, 0, sizeof(rsci)); /* context handle */ status = -ENOMEM; /* the handle needs to be just a unique id, * use a static counter */ ctxh = atomic64_inc_return(&ctxhctr); /* make a copy for the caller */ *handle = ctxh; /* make a copy for the rsc cache */ if (dup_to_netobj(&rsci.handle, (char *)handle, sizeof(uint64_t))) goto out; rscp = rsc_lookup(cd, &rsci); if (!rscp) goto out; /* creds */ if (!ud->found_creds) { /* userspace seem buggy, we should always get at least a * mapping to nobody */ goto out; } else { struct timespec64 boot; /* steal creds */ rsci.cred = ud->creds; memset(&ud->creds, 0, sizeof(struct svc_cred)); status = -EOPNOTSUPP; /* get mech handle from OID */ gm = gss_mech_get_by_OID(&ud->mech_oid); if (!gm) goto out; rsci.cred.cr_gss_mech = gm; status = -EINVAL; /* mech-specific data: */ status = gss_import_sec_context(ud->out_handle.data, ud->out_handle.len, gm, &rsci.mechctx, &expiry, GFP_KERNEL); if (status) goto out; getboottime64(&boot); expiry -= boot.tv_sec; } rsci.h.expiry_time = expiry; rscp = rsc_update(cd, &rsci, rscp); status = 0; out: rsc_free(&rsci); if (rscp) cache_put(&rscp->h, cd); else status = -ENOMEM; return status; } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_netobj cli_handle; struct gssp_upcall_data ud; uint64_t handle; int status; int ret; struct net *net = SVC_NET(rqstp); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; ret = SVC_CLOSE; /* Perform synchronous upcall to gss-proxy */ status = gssp_accept_sec_context_upcall(net, &ud); if (status) goto out; trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: cli_handle = ud.out_handle; break; case GSS_S_COMPLETE: status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); if (status) goto out; cli_handle.data = (u8 *)&handle; cli_handle.len = sizeof(handle); break; default: goto out; } if (!svcauth_gss_proc_init_verf(sn->rsc_cache, rqstp, &cli_handle, &ud.major_status, GSS_SEQ_WIN)) goto out; if (!svcxdr_set_accept_stat(rqstp)) goto out; if (!svcxdr_encode_gss_init_res(&rqstp->rq_res_stream, &cli_handle, &ud.out_token, ud.major_status, ud.minor_status, GSS_SEQ_WIN)) goto out; ret = SVC_COMPLETE; out: gss_free_in_token_pages(&ud.in_token); gssp_free_upcall_data(&ud); return ret; } /* * Try to set the sn->use_gss_proxy variable to a new value. We only allow * it to be changed if it's currently undefined (-1). If it's any other value * then return -EBUSY unless the type wouldn't have changed anyway. */ static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; WARN_ON_ONCE(type != 0 && type != 1); ret = cmpxchg(&sn->use_gss_proxy, -1, type); if (ret != -1 && ret != type) return -EBUSY; return 0; } static bool use_gss_proxy(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); /* If use_gss_proxy is still undefined, then try to disable it */ if (sn->use_gss_proxy == -1) set_gss_proxy(net, 0); return sn->use_gss_proxy; } static noinline_for_stack int svcauth_gss_proc_init(struct svc_rqst *rqstp, struct rpc_gss_wire_cred *gc) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; u32 flavor, len; void *body; /* Call's verf field: */ if (xdr_stream_decode_opaque_auth(xdr, &flavor, &body, &len) < 0) return SVC_GARBAGE; if (flavor != RPC_AUTH_NULL || len != 0) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) { rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (!use_gss_proxy(SVC_NET(rqstp))) return svcauth_gss_legacy_init(rqstp, gc); return svcauth_gss_proxy_init(rqstp, gc); } #ifdef CONFIG_PROC_FS static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; if (*ppos || count > sizeof(tbuf)-1) return -EINVAL; if (copy_from_user(tbuf, buf, count)) return -EFAULT; tbuf[count] = 0; res = kstrtoul(tbuf, 0, &i); if (res) return res; if (i != 1) return -EINVAL; res = set_gssp_clnt(net); if (res) return res; res = set_gss_proxy(net, 1); if (res) return res; return count; } static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; size_t len; snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy); len = strlen(tbuf); if (p >= len) return 0; len -= p; if (len > count) len = count; if (copy_to_user(buf, (void *)(tbuf+p), len)) return -EFAULT; *ppos += len; return len; } static const struct proc_ops use_gss_proxy_proc_ops = { .proc_open = nonseekable_open, .proc_write = write_gssp, .proc_read = read_gssp, }; static int create_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct proc_dir_entry **p = &sn->use_gssp_proc; sn->use_gss_proxy = -1; *p = proc_create_data("use-gss-proxy", S_IFREG | 0600, sn->proc_net_rpc, &use_gss_proxy_proc_ops, net); if (!*p) return -ENOMEM; init_gssp_clnt(sn); return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->use_gssp_proc) { remove_proc_entry("use-gss-proxy", sn->proc_net_rpc); clear_gssp_clnt(sn); } } static ssize_t read_gss_krb5_enctypes(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct rpcsec_gss_oid oid = { .len = 9, .data = "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02", }; struct gss_api_mech *mech; ssize_t ret; mech = gss_mech_get_by_OID(&oid); if (!mech) return 0; if (!mech->gm_upcall_enctypes) { gss_mech_put(mech); return 0; } ret = simple_read_from_buffer(buf, count, ppos, mech->gm_upcall_enctypes, strlen(mech->gm_upcall_enctypes)); gss_mech_put(mech); return ret; } static const struct proc_ops gss_krb5_enctypes_proc_ops = { .proc_open = nonseekable_open, .proc_read = read_gss_krb5_enctypes, }; static int create_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); sn->gss_krb5_enctypes = proc_create_data("gss_krb5_enctypes", S_IFREG | 0444, sn->proc_net_rpc, &gss_krb5_enctypes_proc_ops, net); return sn->gss_krb5_enctypes ? 0 : -ENOMEM; } static void destroy_krb5_enctypes_proc_entry(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); if (sn->gss_krb5_enctypes) remove_proc_entry("gss_krb5_enctypes", sn->proc_net_rpc); } #else /* CONFIG_PROC_FS */ static int create_use_gss_proxy_proc_entry(struct net *net) { return 0; } static void destroy_use_gss_proxy_proc_entry(struct net *net) {} static int create_krb5_enctypes_proc_entry(struct net *net) { return 0; } static void destroy_krb5_enctypes_proc_entry(struct net *net) {} #endif /* CONFIG_PROC_FS */ /* * The Call's credential body should contain a struct rpc_gss_cred_t. * * RFC 2203 Section 5 * * struct rpc_gss_cred_t { * union switch (unsigned int version) { * case RPCSEC_GSS_VERS_1: * struct { * rpc_gss_proc_t gss_proc; * unsigned int seq_num; * rpc_gss_service_t service; * opaque handle<>; * } rpc_gss_cred_vers_1_t; * } * }; */ static bool svcauth_gss_decode_credbody(struct xdr_stream *xdr, struct rpc_gss_wire_cred *gc, __be32 **rpcstart) { ssize_t handle_len; u32 body_len; __be32 *p; p = xdr_inline_decode(xdr, XDR_UNIT); if (!p) return false; /* * start of rpc packet is 7 u32's back from here: * xid direction rpcversion prog vers proc flavour */ *rpcstart = p - 7; body_len = be32_to_cpup(p); if (body_len > RPC_MAX_AUTH_SIZE) return false; /* struct rpc_gss_cred_t */ if (xdr_stream_decode_u32(xdr, &gc->gc_v) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_proc) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_seq) < 0) return false; if (xdr_stream_decode_u32(xdr, &gc->gc_svc) < 0) return false; handle_len = xdr_stream_decode_opaque_inline(xdr, (void **)&gc->gc_ctx.data, body_len); if (handle_len < 0) return false; if (body_len != XDR_UNIT * 5 + xdr_align_size(handle_len)) return false; gc->gc_ctx.len = handle_len; return true; } /** * svcauth_gss_accept - Decode and validate incoming RPC_AUTH_GSS credential * @rqstp: RPC transaction * * Return values: * %SVC_OK: Success * %SVC_COMPLETE: GSS context lifetime event * %SVC_DENIED: Credential or verifier is not valid * %SVC_GARBAGE: Failed to decode credential or verifier * %SVC_CLOSE: Temporary failure * * The rqstp->rq_auth_stat field is also set (see RFCs 2203 and 5531). */ static enum svc_auth_status svcauth_gss_accept(struct svc_rqst *rqstp) { struct gss_svc_data *svcdata = rqstp->rq_auth_data; __be32 *rpcstart; struct rpc_gss_wire_cred *gc; struct rsc *rsci = NULL; int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; svcdata->gsd_databody_offset = 0; svcdata->rsci = NULL; gc = &svcdata->clcred; if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart)) goto auth_err; if (gc->gc_v != RPC_GSS_VERSION) goto auth_err; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (rqstp->rq_proc != 0) goto auth_err; return svcauth_gss_proc_init(rqstp, gc); case RPC_GSS_PROC_DESTROY: if (rqstp->rq_proc != 0) goto auth_err; fallthrough; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; switch (svcauth_gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: goto auth_err; case SVC_DROP: goto drop; } break; default: if (rqstp->rq_proc != 0) goto auth_err; rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } /* now act upon the command: */ switch (gc->gc_proc) { case RPC_GSS_PROC_DESTROY: if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; /* Delete the entry from the cache_list and call cache_put */ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); goto complete; case RPC_GSS_PROC_DATA: rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; if (!svcauth_gss_encode_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; if (!svcxdr_set_accept_stat(rqstp)) goto auth_err; svcdata->gsd_databody_offset = xdr_stream_pos(&rqstp->rq_res_stream); rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_integ(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE); break; case RPC_GSS_SVC_PRIVACY: /* placeholders for body length and seq. number: */ xdr_reserve_space(&rqstp->rq_res_stream, XDR_UNIT * 2); if (svcauth_gss_unwrap_priv(rqstp, gc->gc_seq, rsci->mechctx)) goto garbage_args; svcxdr_set_auth_slack(rqstp, RPC_MAX_AUTH_SIZE * 2); break; default: goto auth_err; } svcdata->rsci = rsci; cache_get(&rsci->h); rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor( rsci->mechctx->mech_type, GSS_C_QOP_DEFAULT, gc->gc_svc); ret = SVC_OK; trace_rpcgss_svc_authenticate(rqstp, gc); goto out; } garbage_args: ret = SVC_GARBAGE; goto out; auth_err: xdr_truncate_encode(&rqstp->rq_res_stream, XDR_UNIT * 2); ret = SVC_DENIED; goto out; complete: ret = SVC_COMPLETE; goto out; drop: ret = SVC_CLOSE; out: if (rsci) cache_put(&rsci->h, sn->rsc_cache); return ret; } static u32 svcauth_gss_prepare_to_wrap(struct svc_rqst *rqstp, struct gss_svc_data *gsd) { u32 offset; /* Release can be called twice, but we only wrap once. */ offset = gsd->gsd_databody_offset; gsd->gsd_databody_offset = 0; /* AUTH_ERROR replies are not wrapped. */ if (rqstp->rq_auth_stat != rpc_auth_ok) return 0; /* Also don't wrap if the accept_stat is nonzero: */ if (*rqstp->rq_accept_statp != rpc_success) return 0; return offset; } /* * RFC 2203, Section 5.3.2.2 * * struct rpc_gss_integ_data { * opaque databody_integ<>; * opaque checksum<>; * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * The RPC Reply message has already been XDR-encoded. rq_res_stream * is now positioned so that the checksum can be written just past * the RPC Reply message. */ static int svcauth_gss_wrap_integ(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct xdr_stream *xdr = &rqstp->rq_res_stream; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = xdr->buf; struct xdr_buf databody_integ; struct xdr_netobj checksum; u32 offset, maj_stat; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) goto out; if (xdr_buf_subsegment(buf, &databody_integ, offset + XDR_UNIT, buf->len - offset - XDR_UNIT)) goto wrap_failed; /* Buffer space for these has already been reserved in * svcauth_gss_accept(). */ if (xdr_encode_word(buf, offset, databody_integ.len)) goto wrap_failed; if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; checksum.data = gsd->gsd_scratch; maj_stat = gss_get_mic(gsd->rsci->mechctx, &databody_integ, &checksum); if (maj_stat != GSS_S_COMPLETE) goto bad_mic; if (xdr_stream_encode_opaque(xdr, checksum.data, checksum.len) < 0) goto wrap_failed; xdr_commit_encode(xdr); out: return 0; bad_mic: trace_rpcgss_svc_get_mic(rqstp, maj_stat); return -EINVAL; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; } /* * RFC 2203, Section 5.3.2.3 * * struct rpc_gss_priv_data { * opaque databody_priv<> * }; * * struct rpc_gss_data_t { * unsigned int seq_num; * proc_req_arg_t arg; * }; * * gss_wrap() expands the size of the RPC message payload in the * response buffer. The main purpose of svcauth_gss_wrap_priv() * is to ensure there is adequate space in the response buffer to * avoid overflow during the wrap. */ static int svcauth_gss_wrap_priv(struct svc_rqst *rqstp) { struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *buf = &rqstp->rq_res; struct kvec *head = buf->head; struct kvec *tail = buf->tail; u32 offset, pad, maj_stat; __be32 *p; offset = svcauth_gss_prepare_to_wrap(rqstp, gsd); if (!offset) return 0; /* * Buffer space for this field has already been reserved * in svcauth_gss_accept(). Note that the GSS sequence * number is encrypted along with the RPC reply payload. */ if (xdr_encode_word(buf, offset + XDR_UNIT, gc->gc_seq)) goto wrap_failed; /* * If there is currently tail data, make sure there is * room for the head, tail, and 2 * RPC_MAX_AUTH_SIZE in * the page, and move the current tail data such that * there is RPC_MAX_AUTH_SIZE slack space available in * both the head and tail. */ if (tail->iov_base) { if (tail->iov_base >= head->iov_base + PAGE_SIZE) goto wrap_failed; if (tail->iov_base < head->iov_base) goto wrap_failed; if (tail->iov_len + head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; memmove(tail->iov_base + RPC_MAX_AUTH_SIZE, tail->iov_base, tail->iov_len); tail->iov_base += RPC_MAX_AUTH_SIZE; } /* * If there is no current tail data, make sure there is * room for the head data, and 2 * RPC_MAX_AUTH_SIZE in the * allotted page, and set up tail information such that there * is RPC_MAX_AUTH_SIZE slack space available in both the * head and tail. */ if (!tail->iov_base) { if (head->iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) goto wrap_failed; tail->iov_base = head->iov_base + head->iov_len + RPC_MAX_AUTH_SIZE; tail->iov_len = 0; } maj_stat = gss_wrap(gsd->rsci->mechctx, offset + XDR_UNIT, buf, buf->pages); if (maj_stat != GSS_S_COMPLETE) goto bad_wrap; /* Wrapping can change the size of databody_priv. */ if (xdr_encode_word(buf, offset, buf->len - offset - XDR_UNIT)) goto wrap_failed; pad = xdr_pad_size(buf->len - offset - XDR_UNIT); p = (__be32 *)(tail->iov_base + tail->iov_len); memset(p, 0, pad); tail->iov_len += pad; buf->len += pad; return 0; wrap_failed: trace_rpcgss_svc_wrap_failed(rqstp); return -EINVAL; bad_wrap: trace_rpcgss_svc_wrap(rqstp, maj_stat); return -ENOMEM; } /** * svcauth_gss_release - Wrap payload and release resources * @rqstp: RPC transaction context * * Return values: * %0: the Reply is ready to be sent * %-ENOMEM: failed to allocate memory * %-EINVAL: encoding error */ static int svcauth_gss_release(struct svc_rqst *rqstp) { struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); struct gss_svc_data *gsd = rqstp->rq_auth_data; struct rpc_gss_wire_cred *gc; int stat; if (!gsd) goto out; gc = &gsd->clcred; if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; case RPC_GSS_SVC_INTEGRITY: stat = svcauth_gss_wrap_integ(rqstp); if (stat) goto out_err; break; case RPC_GSS_SVC_PRIVACY: stat = svcauth_gss_wrap_priv(rqstp); if (stat) goto out_err; break; /* * For any other gc_svc value, svcauth_gss_accept() already set * the auth_error appropriately; just fall through: */ } out: stat = 0; out_err: if (rqstp->rq_client) auth_domain_put(rqstp->rq_client); rqstp->rq_client = NULL; if (rqstp->rq_gssclient) auth_domain_put(rqstp->rq_gssclient); rqstp->rq_gssclient = NULL; if (rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; if (gsd && gsd->rsci) { cache_put(&gsd->rsci->h, sn->rsc_cache); gsd->rsci = NULL; } return stat; } static void svcauth_gss_domain_release_rcu(struct rcu_head *head) { struct auth_domain *dom = container_of(head, struct auth_domain, rcu_head); struct gss_domain *gd = container_of(dom, struct gss_domain, h); kfree(dom->name); kfree(gd); } static void svcauth_gss_domain_release(struct auth_domain *dom) { call_rcu(&dom->rcu_head, svcauth_gss_domain_release_rcu); } static rpc_authflavor_t svcauth_gss_pseudoflavor(struct svc_rqst *rqstp) { return svcauth_gss_flavor(rqstp->rq_gssclient); } static struct auth_ops svcauthops_gss = { .name = "rpcsec_gss", .owner = THIS_MODULE, .flavour = RPC_AUTH_GSS, .accept = svcauth_gss_accept, .release = svcauth_gss_release, .domain_release = svcauth_gss_domain_release, .set_client = svcauth_gss_set_client, .pseudoflavor = svcauth_gss_pseudoflavor, }; static int rsi_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsi_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsi_cache = cd; return 0; } static void rsi_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsi_cache; sn->rsi_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } static int rsc_cache_create_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd; int err; cd = cache_create_net(&rsc_cache_template, net); if (IS_ERR(cd)) return PTR_ERR(cd); err = cache_register_net(cd, net); if (err) { cache_destroy_net(cd, net); return err; } sn->rsc_cache = cd; return 0; } static void rsc_cache_destroy_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct cache_detail *cd = sn->rsc_cache; sn->rsc_cache = NULL; cache_purge(cd); cache_unregister_net(cd, net); cache_destroy_net(cd, net); } int gss_svc_init_net(struct net *net) { int rv; rv = rsc_cache_create_net(net); if (rv) return rv; rv = rsi_cache_create_net(net); if (rv) goto out1; rv = create_use_gss_proxy_proc_entry(net); if (rv) goto out2; rv = create_krb5_enctypes_proc_entry(net); if (rv) goto out3; return 0; out3: destroy_use_gss_proxy_proc_entry(net); out2: rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; } void gss_svc_shutdown_net(struct net *net) { destroy_krb5_enctypes_proc_entry(net); destroy_use_gss_proxy_proc_entry(net); rsi_cache_destroy_net(net); rsc_cache_destroy_net(net); } int gss_svc_init(void) { return svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); } void gss_svc_shutdown(void) { svc_auth_unregister(RPC_AUTH_GSS); }
240 124 3 6 162 95 165 162 165 163 163 163 66 66 13 9 96 70 96 200 163 63 63 63 68 1 6 42 5 5 66 68 70 66 66 66 66 18 75 32 18 236 25 149 106 233 238 230 199 128 221 228 2 1 84 193 6 83 6 82 163 143 37 237 82 240 139 234 153 243 153 130 192 96 96 95 2 198 96 173 139 96 201 201 203 197 197 163 197 197 197 197 197 3 197 202 194 202 160 160 198 200 195 176 16 4 16 67 57 7 35 39 38 48 50 47 12 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H #include <linux/unaligned.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/align.h> #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> struct extent_buffer; struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; unsigned long offset; }; void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); /* * Some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple one * for u8: */ #define le8_to_cpu(v) (v) #define cpu_to_le8(v) (v) #define __le8 u8 static inline u8 get_unaligned_le8(const void *p) { return *(const u8 *)p; } static inline void put_unaligned_le8(u8 val, void *p) { *(u8 *)p = val; } #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof_field(type, member))) #define write_eb_member(eb, ptr, type, member, source) ( \ write_extent_buffer(eb, (const char *)(source), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off); \ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off, \ u##bits val); \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off); \ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val); DECLARE_BTRFS_SETGET_BITS(8) DECLARE_BTRFS_SETGET_BITS(16) DECLARE_BTRFS_SETGET_BITS(32) DECLARE_BTRFS_SETGET_BITS(64) #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = folio_address(eb->folios[0]) + \ offset_in_page(eb->start); \ return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \ put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ put_unaligned_le##bits(val, &s->member); \ } static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, generation, 64); static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); } static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); } BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) { return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); } BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) { unsigned long offset = (unsigned long)c; offset += offsetof(struct btrfs_chunk, stripe); offset += nr * sizeof(struct btrfs_stripe); return (struct btrfs_stripe *)offset; } static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) { return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); } static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, struct btrfs_chunk *c, int nr, u64 val) { btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); } static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, struct btrfs_chunk *c, int nr, u64 val) { btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); } /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, struct btrfs_block_group_item, flags, 64); /* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_extref */ BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, parent_objectid, 64); BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, name_len, 16); BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64); BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64); BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64); /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, chunk_objectid, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, chunk_objectid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); static inline void btrfs_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, objectid, 64); BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, offset, 64); BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref, root_id, 64); BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, offset, 64); static inline u32 btrfs_extent_inline_ref_size(int type) { if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY) return sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_SHARED_DATA_REF_KEY) return sizeof(struct btrfs_shared_data_ref) + sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); if (type == BTRFS_EXTENT_OWNER_REF_KEY) return sizeof(struct btrfs_extent_inline_ref); return 0; } /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, generation, 64); static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); } static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); } static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; } void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(const struct extent_buffer *eb, const struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr; ptr = btrfs_node_key_ptr_offset(eb, nr); write_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } /* struct btrfs_item */ BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_leaf, items) + sizeof(struct btrfs_item) * nr; } static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) { return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr); } #define BTRFS_ITEM_SETGET_FUNCS(member) \ static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ { \ return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ } \ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ } \ static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ int slot) \ { \ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ return btrfs_token_raw_item_##member(token, item); \ } \ static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ int slot, u32 val) \ { \ struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ btrfs_set_token_raw_item_##member(token, item, val); \ } BTRFS_ITEM_SETGET_FUNCS(offset) BTRFS_ITEM_SETGET_FUNCS(size); static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) { return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); } static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(eb, nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } static inline void btrfs_set_item_key(struct extent_buffer *eb, const struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(eb, nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); /* struct btrfs_root_ref */ BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, const struct btrfs_dir_item *item) { return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); } static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) { return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); } static inline void btrfs_dir_item_key(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_dir_item, location, key); } static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_dir_item, location, key); } BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, num_entries, 64); BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, num_bitmaps, 64); BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, generation, 64); static inline void btrfs_free_space_key(const struct extent_buffer *eb, const struct btrfs_free_space_header *h, struct btrfs_disk_key *key) { read_eb_member(eb, h, struct btrfs_free_space_header, location, key); } static inline void btrfs_set_free_space_key(struct extent_buffer *eb, struct btrfs_free_space_header *h, const struct btrfs_disk_key *key) { write_eb_member(eb, h, struct btrfs_free_space_header, location, key); } /* struct btrfs_disk_key */ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); #ifdef __LITTLE_ENDIAN /* * Optimized helpers for little-endian architectures where CPU and on-disk * structures have the same endianness and we can skip conversions. */ static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, const struct btrfs_disk_key *disk_key) { memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, const struct btrfs_key *cpu_key) { memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); } static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *cpu_key, int nr) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_node_key(eb, disk_key, nr); } static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *cpu_key, int nr) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_item_key(eb, disk_key, nr); } static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_key *cpu_key) { struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; btrfs_dir_item_key(eb, item, disk_key); } #else static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, const struct btrfs_disk_key *disk) { cpu->offset = le64_to_cpu(disk->offset); cpu->type = disk->type; cpu->objectid = le64_to_cpu(disk->objectid); } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, const struct btrfs_key *cpu) { disk->offset = cpu_to_le64(cpu->offset); disk->type = cpu->type; disk->objectid = cpu_to_le64(cpu->objectid); } static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_node_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_item_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_key *key) { struct btrfs_disk_key disk_key; btrfs_dir_item_key(eb, item, &disk_key); btrfs_disk_key_to_cpu(key, &disk_key); } #endif /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) { return (btrfs_header_flags(eb) & flag) == flag; } static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags | flag); } static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags & ~flag); } static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) { u64 flags = btrfs_header_flags(eb); return flags >> BTRFS_BACKREF_REV_SHIFT; } static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) { u64 flags = btrfs_header_flags(eb); flags &= ~BTRFS_BACKREF_REV_MASK; flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; btrfs_set_header_flags(eb, flags); } static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; } /* struct btrfs_root_item */ BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, generation_v2, 64); BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, tree_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, tree_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, chunk_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, extent_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, extent_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, extent_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, fs_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, fs_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, fs_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, dev_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, dev_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, dev_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, csum_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, csum_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, csum_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); static inline void btrfs_balance_data(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_set_balance_data(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_balance_meta(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_set_balance_meta(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_balance_sys(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_set_balance_sys(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, struct btrfs_super_block, sys_chunk_array_size, 32); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, struct btrfs_super_block, chunk_root_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, sectorsize, 32); BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, nodesize, 32); BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, root_dir_objectid, 64); BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, num_devices, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, compat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, compat_ro_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, cache_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, nr_global_roots, 64); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, compression, 8); BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, encryption, 8); BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, version, 64); BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item, enable_gen, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); /* btrfs_qgroup_limit_item */ BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, max_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, max_excl, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, max_rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, max_excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); /* btrfs_dev_replace_item */ BTRFS_SETGET_FUNCS(dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); /* btrfs_verity_descriptor_item */ BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, struct btrfs_verity_descriptor_item, encryption, 8); BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, struct btrfs_verity_descriptor_item, size, 64); /* Cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #endif
289 288 289 52 52 52 128 2 5619 6458 5680 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MMU_CONTEXT_H #define _ASM_X86_MMU_CONTEXT_H #include <asm/desc.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/pkeys.h> #include <trace/events/tlb.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/gsseg.h> extern atomic64_t last_mm_ctx_id; #ifdef CONFIG_PERF_EVENTS DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); void cr4_update_pce(void *ignored); #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL /* * ldt_structs can be allocated, used, and freed, but they are never * modified while live. */ struct ldt_struct { /* * Xen requires page-aligned LDTs with special permissions. This is * needed to prevent us from installing evil descriptors such as * call gates. On native, we could merge the ldt_struct and LDT * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; unsigned int nr_entries; /* * If PTI is in use, then the entries array is not mapped while we're * in user mode. The whole array will be aliased at the addressed * given by ldt_slot_va(slot). We use two slots so that we can allocate * and map, and enable a new LDT without invalidating the mapping * of an older, still-in-use LDT. * * slot will be -1 if this LDT doesn't have an alias mapping. */ int slot; }; /* * Used for LDT copy/destruction. */ static inline void init_new_context_ldt(struct mm_struct *mm) { mm->context.ldt = NULL; init_rwsem(&mm->context.ldt_usr_sem); } int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); void ldt_arch_exit_mmap(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ static inline void init_new_context_ldt(struct mm_struct *mm) { } static inline int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm) { return 0; } static inline void destroy_context_ldt(struct mm_struct *mm) { } static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL extern void load_mm_ldt(struct mm_struct *mm); extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next); #else static inline void load_mm_ldt(struct mm_struct *mm) { clear_LDT(); } static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { DEBUG_LOCKS_WARN_ON(preemptible()); } #endif #ifdef CONFIG_ADDRESS_MASKING static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { /* * When switch_mm_irqs_off() is called for a kthread, it may race with * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it * reads a single value for both. */ return READ_ONCE(mm->context.lam_cr3_mask); } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask; mm->context.untag_mask = oldmm->context.untag_mask; } #define mm_untag_mask mm_untag_mask static inline unsigned long mm_untag_mask(struct mm_struct *mm) { return mm->context.untag_mask; } static inline void mm_reset_untag_mask(struct mm_struct *mm) { mm->context.untag_mask = -1UL; } #define arch_pgtable_dma_compat arch_pgtable_dma_compat static inline bool arch_pgtable_dma_compat(struct mm_struct *mm) { return !mm_lam_cr3_mask(mm) || test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags); } #else static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { return 0; } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { } static inline void mm_reset_untag_mask(struct mm_struct *mm) { } #endif #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). */ #define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { mutex_init(&mm->context.lock); mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and allocated implicitly */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; } #endif mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; } #define destroy_context destroy_context static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); #define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ paravirt_enter_mmap(next); \ switch_mm((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ loadsegment(gs, 0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ do { \ shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) #endif static inline void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* Duplicate the oldmm pkey state in mm: */ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; #endif } static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); paravirt_enter_mmap(mm); dup_lam(oldmm, mm); return ldt_dup_context(oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags); } #else static inline bool is_64bit_mm(struct mm_struct *mm) { return false; } #endif /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other * processes or any way to tell *which * PKRU in a threaded * process we could use. * * So do not enforce things if the VMA is not from the current * mm, or if we are in a kernel thread. */ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { /* pkeys never affect instruction fetches */ if (execute) return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; return __pkru_allows_pkey(vma_pkey(vma), write); } unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> #endif /* _ASM_X86_MMU_CONTEXT_H */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 // SPDX-License-Identifier: GPL-2.0 /* -*- linux-c -*- * Cypress USB Thermometer driver * * Copyright (c) 2004 Erik Rigtorp <erkki@linux.nu> <erik@rigtorp.com> * * This driver works with Elektor magazine USB Interface as published in * issue #291. It should also work with the original starter kit/demo board * from Cypress. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/usb.h> #define DRIVER_AUTHOR "Erik Rigtorp" #define DRIVER_DESC "Cypress USB Thermometer driver" #define USB_SKEL_VENDOR_ID 0x04b4 #define USB_SKEL_PRODUCT_ID 0x0002 static const struct usb_device_id id_table[] = { { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) }, { } }; MODULE_DEVICE_TABLE (usb, id_table); /* Structure to hold all of our device specific stuff */ struct usb_cytherm { struct usb_device *udev; /* save off the usb device pointer */ struct usb_interface *interface; /* the interface for this device */ int brightness; }; /* Vendor requests */ /* They all operate on one byte at a time */ #define PING 0x00 #define READ_ROM 0x01 /* Reads form ROM, value = address */ #define READ_RAM 0x02 /* Reads form RAM, value = address */ #define WRITE_RAM 0x03 /* Write to RAM, value = address, index = data */ #define READ_PORT 0x04 /* Reads from port, value = address */ #define WRITE_PORT 0x05 /* Write to port, value = address, index = data */ /* Send a vendor command to device */ static int vendor_command(struct usb_device *dev, unsigned char request, unsigned char value, unsigned char index, void *buf, int size) { return usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), request, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_OTHER, value, index, buf, size, USB_CTRL_GET_TIMEOUT); } #define BRIGHTNESS 0x2c /* RAM location for brightness value */ #define BRIGHTNESS_SEM 0x2b /* RAM location for brightness semaphore */ static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); return sprintf(buf, "%i", cytherm->brightness); } static ssize_t brightness_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); unsigned char *buffer; int retval; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return 0; cytherm->brightness = simple_strtoul(buf, NULL, 10); if (cytherm->brightness > 0xFF) cytherm->brightness = 0xFF; else if (cytherm->brightness < 0) cytherm->brightness = 0; /* Set brightness */ retval = vendor_command(cytherm->udev, WRITE_RAM, BRIGHTNESS, cytherm->brightness, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); /* Inform µC that we have changed the brightness setting */ retval = vendor_command(cytherm->udev, WRITE_RAM, BRIGHTNESS_SEM, 0x01, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); kfree(buffer); return count; } static DEVICE_ATTR_RW(brightness); #define TEMP 0x33 /* RAM location for temperature */ #define SIGN 0x34 /* RAM location for temperature sign */ static ssize_t temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); int retval; unsigned char *buffer; int temp, sign; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return 0; /* read temperature */ retval = vendor_command(cytherm->udev, READ_RAM, TEMP, 0, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); temp = buffer[1]; /* read sign */ retval = vendor_command(cytherm->udev, READ_RAM, SIGN, 0, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); sign = buffer[1]; kfree(buffer); return sprintf(buf, "%c%i.%i", sign ? '-' : '+', temp >> 1, 5*(temp - ((temp >> 1) << 1))); } static DEVICE_ATTR_RO(temp); #define BUTTON 0x7a static ssize_t button_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); int retval; unsigned char *buffer; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return 0; /* check button */ retval = vendor_command(cytherm->udev, READ_RAM, BUTTON, 0, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); retval = buffer[1]; kfree(buffer); if (retval) return sprintf(buf, "1"); else return sprintf(buf, "0"); } static DEVICE_ATTR_RO(button); static ssize_t port0_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); int retval; unsigned char *buffer; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return 0; retval = vendor_command(cytherm->udev, READ_PORT, 0, 0, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); retval = buffer[1]; kfree(buffer); return sprintf(buf, "%d", retval); } static ssize_t port0_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); unsigned char *buffer; int retval; int tmp; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return 0; tmp = simple_strtoul(buf, NULL, 10); if (tmp > 0xFF) tmp = 0xFF; else if (tmp < 0) tmp = 0; retval = vendor_command(cytherm->udev, WRITE_PORT, 0, tmp, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); kfree(buffer); return count; } static DEVICE_ATTR_RW(port0); static ssize_t port1_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); int retval; unsigned char *buffer; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return 0; retval = vendor_command(cytherm->udev, READ_PORT, 1, 0, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); retval = buffer[1]; kfree(buffer); return sprintf(buf, "%d", retval); } static ssize_t port1_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct usb_interface *intf = to_usb_interface(dev); struct usb_cytherm *cytherm = usb_get_intfdata(intf); unsigned char *buffer; int retval; int tmp; buffer = kmalloc(8, GFP_KERNEL); if (!buffer) return 0; tmp = simple_strtoul(buf, NULL, 10); if (tmp > 0xFF) tmp = 0xFF; else if (tmp < 0) tmp = 0; retval = vendor_command(cytherm->udev, WRITE_PORT, 1, tmp, buffer, 8); if (retval) dev_dbg(&cytherm->udev->dev, "retval = %d\n", retval); kfree(buffer); return count; } static DEVICE_ATTR_RW(port1); static struct attribute *cytherm_attrs[] = { &dev_attr_brightness.attr, &dev_attr_temp.attr, &dev_attr_button.attr, &dev_attr_port0.attr, &dev_attr_port1.attr, NULL, }; ATTRIBUTE_GROUPS(cytherm); static int cytherm_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(interface); struct usb_cytherm *dev; int retval = -ENOMEM; dev = kzalloc(sizeof(struct usb_cytherm), GFP_KERNEL); if (!dev) goto error_mem; dev->udev = usb_get_dev(udev); usb_set_intfdata(interface, dev); dev->brightness = 0xFF; dev_info(&interface->dev, "Cypress thermometer device now attached\n"); return 0; error_mem: return retval; } static void cytherm_disconnect(struct usb_interface *interface) { struct usb_cytherm *dev; dev = usb_get_intfdata(interface); /* first remove the files, then NULL the pointer */ usb_set_intfdata(interface, NULL); usb_put_dev(dev->udev); kfree(dev); dev_info(&interface->dev, "Cypress thermometer now disconnected\n"); } /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver cytherm_driver = { .name = "cytherm", .probe = cytherm_probe, .disconnect = cytherm_disconnect, .id_table = id_table, .dev_groups = cytherm_groups, }; module_usb_driver(cytherm_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 // SPDX-License-Identifier: GPL-2.0-or-later /* * Driver for DVBSky USB2.0 receiver * * Copyright (C) 2013 Max nibble <nibble.max@gmail.com> */ #include "dvb_usb.h" #include "m88ds3103.h" #include "ts2020.h" #include "sp2.h" #include "si2168.h" #include "si2157.h" #define DVBSKY_MSG_DELAY 0/*2000*/ #define DVBSKY_BUF_LEN 64 static int dvb_usb_dvbsky_disable_rc; module_param_named(disable_rc, dvb_usb_dvbsky_disable_rc, int, 0644); MODULE_PARM_DESC(disable_rc, "Disable inbuilt IR receiver."); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); struct dvbsky_state { u8 ibuf[DVBSKY_BUF_LEN]; u8 obuf[DVBSKY_BUF_LEN]; u8 last_lock; struct i2c_client *i2c_client_demod; struct i2c_client *i2c_client_tuner; struct i2c_client *i2c_client_ci; /* fe hook functions*/ int (*fe_set_voltage)(struct dvb_frontend *fe, enum fe_sec_voltage voltage); int (*fe_read_status)(struct dvb_frontend *fe, enum fe_status *status); }; static int dvbsky_usb_generic_rw(struct dvb_usb_device *d, u8 *wbuf, u16 wlen, u8 *rbuf, u16 rlen) { int ret; struct dvbsky_state *state = d_to_priv(d); mutex_lock(&d->usb_mutex); if (wlen != 0) memcpy(state->obuf, wbuf, wlen); ret = dvb_usbv2_generic_rw_locked(d, state->obuf, wlen, state->ibuf, rlen); if (!ret && (rlen != 0)) memcpy(rbuf, state->ibuf, rlen); mutex_unlock(&d->usb_mutex); return ret; } static int dvbsky_stream_ctrl(struct dvb_usb_device *d, u8 onoff) { struct dvbsky_state *state = d_to_priv(d); static const u8 obuf_pre[3] = { 0x37, 0, 0 }; static const u8 obuf_post[3] = { 0x36, 3, 0 }; int ret; mutex_lock(&d->usb_mutex); memcpy(state->obuf, obuf_pre, 3); ret = dvb_usbv2_generic_write_locked(d, state->obuf, 3); if (!ret && onoff) { msleep(20); memcpy(state->obuf, obuf_post, 3); ret = dvb_usbv2_generic_write_locked(d, state->obuf, 3); } mutex_unlock(&d->usb_mutex); return ret; } static int dvbsky_streaming_ctrl(struct dvb_frontend *fe, int onoff) { struct dvb_usb_device *d = fe_to_d(fe); return dvbsky_stream_ctrl(d, (onoff == 0) ? 0 : 1); } /* GPIO */ static int dvbsky_gpio_ctrl(struct dvb_usb_device *d, u8 gport, u8 value) { int ret; u8 obuf[3], ibuf[2]; obuf[0] = 0x0e; obuf[1] = gport; obuf[2] = value; ret = dvbsky_usb_generic_rw(d, obuf, 3, ibuf, 1); return ret; } /* I2C */ static int dvbsky_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); int ret = 0; u8 ibuf[64], obuf[64]; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; if (num > 2) { dev_err(&d->udev->dev, "too many i2c messages[%d], max 2.", num); ret = -EOPNOTSUPP; goto i2c_error; } if (num == 1) { if (msg[0].len > 60) { dev_err(&d->udev->dev, "too many i2c bytes[%d], max 60.", msg[0].len); ret = -EOPNOTSUPP; goto i2c_error; } if (msg[0].flags & I2C_M_RD) { /* single read */ obuf[0] = 0x09; obuf[1] = 0; obuf[2] = msg[0].len; obuf[3] = msg[0].addr; ret = dvbsky_usb_generic_rw(d, obuf, 4, ibuf, msg[0].len + 1); if (!ret) memcpy(msg[0].buf, &ibuf[1], msg[0].len); } else { /* write */ obuf[0] = 0x08; obuf[1] = msg[0].addr; obuf[2] = msg[0].len; memcpy(&obuf[3], msg[0].buf, msg[0].len); ret = dvbsky_usb_generic_rw(d, obuf, msg[0].len + 3, ibuf, 1); } } else { if ((msg[0].len > 60) || (msg[1].len > 60)) { dev_err(&d->udev->dev, "too many i2c bytes[w-%d][r-%d], max 60.", msg[0].len, msg[1].len); ret = -EOPNOTSUPP; goto i2c_error; } /* write then read */ obuf[0] = 0x09; obuf[1] = msg[0].len; obuf[2] = msg[1].len; obuf[3] = msg[0].addr; memcpy(&obuf[4], msg[0].buf, msg[0].len); ret = dvbsky_usb_generic_rw(d, obuf, msg[0].len + 4, ibuf, msg[1].len + 1); if (!ret) memcpy(msg[1].buf, &ibuf[1], msg[1].len); } i2c_error: mutex_unlock(&d->i2c_mutex); return (ret) ? ret : num; } static u32 dvbsky_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static struct i2c_algorithm dvbsky_i2c_algo = { .master_xfer = dvbsky_i2c_xfer, .functionality = dvbsky_i2c_func, }; #if IS_ENABLED(CONFIG_RC_CORE) static int dvbsky_rc_query(struct dvb_usb_device *d) { u32 code = 0xffff, scancode; u8 rc5_command, rc5_system; u8 obuf[2], ibuf[2], toggle; int ret; obuf[0] = 0x10; ret = dvbsky_usb_generic_rw(d, obuf, 1, ibuf, 2); if (ret == 0) code = (ibuf[0] << 8) | ibuf[1]; if (code != 0xffff) { dev_dbg(&d->udev->dev, "rc code: %x\n", code); rc5_command = code & 0x3F; rc5_system = (code & 0x7C0) >> 6; toggle = (code & 0x800) ? 1 : 0; scancode = rc5_system << 8 | rc5_command; rc_keydown(d->rc_dev, RC_PROTO_RC5, scancode, toggle); } return 0; } static int dvbsky_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { if (dvb_usb_dvbsky_disable_rc) { rc->map_name = NULL; return 0; } rc->allowed_protos = RC_PROTO_BIT_RC5; rc->query = dvbsky_rc_query; rc->interval = 300; return 0; } #else #define dvbsky_get_rc_config NULL #endif static int dvbsky_usb_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { struct dvb_usb_device *d = fe_to_d(fe); struct dvbsky_state *state = d_to_priv(d); u8 value; if (voltage == SEC_VOLTAGE_OFF) value = 0; else value = 1; dvbsky_gpio_ctrl(d, 0x80, value); return state->fe_set_voltage(fe, voltage); } static int dvbsky_read_mac_addr(struct dvb_usb_adapter *adap, u8 mac[6]) { struct dvb_usb_device *d = adap_to_d(adap); u8 obuf[] = { 0x1e, 0x00 }; u8 ibuf[6] = { 0 }; struct i2c_msg msg[] = { { .addr = 0x51, .flags = 0, .buf = obuf, .len = 2, }, { .addr = 0x51, .flags = I2C_M_RD, .buf = ibuf, .len = 6, } }; if (i2c_transfer(&d->i2c_adap, msg, 2) == 2) memcpy(mac, ibuf, 6); return 0; } static int dvbsky_usb_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct dvb_usb_device *d = fe_to_d(fe); struct dvbsky_state *state = d_to_priv(d); int ret; ret = state->fe_read_status(fe, status); /* it need resync slave fifo when signal change from unlock to lock.*/ if ((*status & FE_HAS_LOCK) && (!state->last_lock)) dvbsky_stream_ctrl(d, 1); state->last_lock = (*status & FE_HAS_LOCK) ? 1 : 0; return ret; } static int dvbsky_s960_attach(struct dvb_usb_adapter *adap) { struct dvbsky_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct i2c_adapter *i2c_adapter; struct m88ds3103_platform_data m88ds3103_pdata = {}; struct ts2020_config ts2020_config = {}; /* attach demod */ m88ds3103_pdata.clk = 27000000; m88ds3103_pdata.i2c_wr_max = 33; m88ds3103_pdata.clk_out = 0; m88ds3103_pdata.ts_mode = M88DS3103_TS_CI; m88ds3103_pdata.ts_clk = 16000; m88ds3103_pdata.ts_clk_pol = 0; m88ds3103_pdata.agc = 0x99; m88ds3103_pdata.lnb_hv_pol = 1; m88ds3103_pdata.lnb_en_pol = 1; state->i2c_client_demod = dvb_module_probe("m88ds3103", NULL, &d->i2c_adap, 0x68, &m88ds3103_pdata); if (!state->i2c_client_demod) return -ENODEV; adap->fe[0] = m88ds3103_pdata.get_dvb_frontend(state->i2c_client_demod); i2c_adapter = m88ds3103_pdata.get_i2c_adapter(state->i2c_client_demod); /* attach tuner */ ts2020_config.fe = adap->fe[0]; ts2020_config.get_agc_pwm = m88ds3103_get_agc_pwm; state->i2c_client_tuner = dvb_module_probe("ts2020", NULL, i2c_adapter, 0x60, &ts2020_config); if (!state->i2c_client_tuner) { dvb_module_release(state->i2c_client_demod); return -ENODEV; } /* delegate signal strength measurement to tuner */ adap->fe[0]->ops.read_signal_strength = adap->fe[0]->ops.tuner_ops.get_rf_strength; /* hook fe: need to resync the slave fifo when signal locks. */ state->fe_read_status = adap->fe[0]->ops.read_status; adap->fe[0]->ops.read_status = dvbsky_usb_read_status; /* hook fe: LNB off/on is control by Cypress usb chip. */ state->fe_set_voltage = adap->fe[0]->ops.set_voltage; adap->fe[0]->ops.set_voltage = dvbsky_usb_set_voltage; return 0; } static int dvbsky_usb_ci_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { struct dvb_usb_device *d = fe_to_d(fe); struct dvbsky_state *state = d_to_priv(d); u8 value; if (voltage == SEC_VOLTAGE_OFF) value = 0; else value = 1; dvbsky_gpio_ctrl(d, 0x00, value); return state->fe_set_voltage(fe, voltage); } static int dvbsky_ci_ctrl(void *priv, u8 read, int addr, u8 data, int *mem) { struct dvb_usb_device *d = priv; int ret = 0; u8 command[4], respond[2], command_size, respond_size; command[1] = (u8)((addr >> 8) & 0xff); /*high part of address*/ command[2] = (u8)(addr & 0xff); /*low part of address*/ if (read) { command[0] = 0x71; command_size = 3; respond_size = 2; } else { command[0] = 0x70; command[3] = data; command_size = 4; respond_size = 1; } ret = dvbsky_usb_generic_rw(d, command, command_size, respond, respond_size); if (ret) goto err; if (read) *mem = respond[1]; return ret; err: dev_err(&d->udev->dev, "ci control failed=%d\n", ret); return ret; } static int dvbsky_s960c_attach(struct dvb_usb_adapter *adap) { struct dvbsky_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct i2c_adapter *i2c_adapter; struct m88ds3103_platform_data m88ds3103_pdata = {}; struct ts2020_config ts2020_config = {}; struct sp2_config sp2_config = {}; /* attach demod */ m88ds3103_pdata.clk = 27000000; m88ds3103_pdata.i2c_wr_max = 33; m88ds3103_pdata.clk_out = 0; m88ds3103_pdata.ts_mode = M88DS3103_TS_CI; m88ds3103_pdata.ts_clk = 10000; m88ds3103_pdata.ts_clk_pol = 1; m88ds3103_pdata.agc = 0x99; m88ds3103_pdata.lnb_hv_pol = 0; m88ds3103_pdata.lnb_en_pol = 1; state->i2c_client_demod = dvb_module_probe("m88ds3103", NULL, &d->i2c_adap, 0x68, &m88ds3103_pdata); if (!state->i2c_client_demod) return -ENODEV; adap->fe[0] = m88ds3103_pdata.get_dvb_frontend(state->i2c_client_demod); i2c_adapter = m88ds3103_pdata.get_i2c_adapter(state->i2c_client_demod); /* attach tuner */ ts2020_config.fe = adap->fe[0]; ts2020_config.get_agc_pwm = m88ds3103_get_agc_pwm; state->i2c_client_tuner = dvb_module_probe("ts2020", NULL, i2c_adapter, 0x60, &ts2020_config); if (!state->i2c_client_tuner) { dvb_module_release(state->i2c_client_demod); return -ENODEV; } /* attach ci controller */ sp2_config.dvb_adap = &adap->dvb_adap; sp2_config.priv = d; sp2_config.ci_control = dvbsky_ci_ctrl; state->i2c_client_ci = dvb_module_probe("sp2", NULL, &d->i2c_adap, 0x40, &sp2_config); if (!state->i2c_client_ci) { dvb_module_release(state->i2c_client_tuner); dvb_module_release(state->i2c_client_demod); return -ENODEV; } /* delegate signal strength measurement to tuner */ adap->fe[0]->ops.read_signal_strength = adap->fe[0]->ops.tuner_ops.get_rf_strength; /* hook fe: need to resync the slave fifo when signal locks. */ state->fe_read_status = adap->fe[0]->ops.read_status; adap->fe[0]->ops.read_status = dvbsky_usb_read_status; /* hook fe: LNB off/on is control by Cypress usb chip. */ state->fe_set_voltage = adap->fe[0]->ops.set_voltage; adap->fe[0]->ops.set_voltage = dvbsky_usb_ci_set_voltage; return 0; } static int dvbsky_t680c_attach(struct dvb_usb_adapter *adap) { struct dvbsky_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct i2c_adapter *i2c_adapter; struct si2168_config si2168_config = {}; struct si2157_config si2157_config = {}; struct sp2_config sp2_config = {}; /* attach demod */ si2168_config.i2c_adapter = &i2c_adapter; si2168_config.fe = &adap->fe[0]; si2168_config.ts_mode = SI2168_TS_PARALLEL; state->i2c_client_demod = dvb_module_probe("si2168", NULL, &d->i2c_adap, 0x64, &si2168_config); if (!state->i2c_client_demod) return -ENODEV; /* attach tuner */ si2157_config.fe = adap->fe[0]; si2157_config.if_port = 1; state->i2c_client_tuner = dvb_module_probe("si2157", NULL, i2c_adapter, 0x60, &si2157_config); if (!state->i2c_client_tuner) { dvb_module_release(state->i2c_client_demod); return -ENODEV; } /* attach ci controller */ sp2_config.dvb_adap = &adap->dvb_adap; sp2_config.priv = d; sp2_config.ci_control = dvbsky_ci_ctrl; state->i2c_client_ci = dvb_module_probe("sp2", NULL, &d->i2c_adap, 0x40, &sp2_config); if (!state->i2c_client_ci) { dvb_module_release(state->i2c_client_tuner); dvb_module_release(state->i2c_client_demod); return -ENODEV; } return 0; } static int dvbsky_t330_attach(struct dvb_usb_adapter *adap) { struct dvbsky_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct i2c_adapter *i2c_adapter; struct si2168_config si2168_config = {}; struct si2157_config si2157_config = {}; /* attach demod */ si2168_config.i2c_adapter = &i2c_adapter; si2168_config.fe = &adap->fe[0]; si2168_config.ts_mode = SI2168_TS_PARALLEL; si2168_config.ts_clock_gapped = true; state->i2c_client_demod = dvb_module_probe("si2168", NULL, &d->i2c_adap, 0x64, &si2168_config); if (!state->i2c_client_demod) return -ENODEV; /* attach tuner */ si2157_config.fe = adap->fe[0]; si2157_config.if_port = 1; state->i2c_client_tuner = dvb_module_probe("si2157", NULL, i2c_adapter, 0x60, &si2157_config); if (!state->i2c_client_tuner) { dvb_module_release(state->i2c_client_demod); return -ENODEV; } return 0; } static int dvbsky_mygica_t230c_attach(struct dvb_usb_adapter *adap) { struct dvbsky_state *state = adap_to_priv(adap); struct dvb_usb_device *d = adap_to_d(adap); struct i2c_adapter *i2c_adapter; struct si2168_config si2168_config = {}; struct si2157_config si2157_config = {}; /* attach demod */ si2168_config.i2c_adapter = &i2c_adapter; si2168_config.fe = &adap->fe[0]; si2168_config.ts_mode = SI2168_TS_PARALLEL; if (le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_MYGICA_T230C2 || le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_MYGICA_T230C2_LITE || le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_MYGICA_T230A) si2168_config.ts_mode |= SI2168_TS_CLK_MANUAL; si2168_config.ts_clock_inv = 1; state->i2c_client_demod = dvb_module_probe("si2168", NULL, &d->i2c_adap, 0x64, &si2168_config); if (!state->i2c_client_demod) return -ENODEV; /* attach tuner */ si2157_config.fe = adap->fe[0]; if (le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_MYGICA_T230) { si2157_config.if_port = 1; state->i2c_client_tuner = dvb_module_probe("si2157", NULL, i2c_adapter, 0x60, &si2157_config); } else { si2157_config.if_port = 0; state->i2c_client_tuner = dvb_module_probe("si2157", "si2141", i2c_adapter, 0x60, &si2157_config); } if (!state->i2c_client_tuner) { dvb_module_release(state->i2c_client_demod); return -ENODEV; } return 0; } static int dvbsky_identify_state(struct dvb_usb_device *d, const char **name) { if (le16_to_cpu(d->udev->descriptor.idProduct) == USB_PID_MYGICA_T230A) { dvbsky_gpio_ctrl(d, 0x87, 0); msleep(20); dvbsky_gpio_ctrl(d, 0x86, 1); dvbsky_gpio_ctrl(d, 0x80, 0); msleep(100); dvbsky_gpio_ctrl(d, 0x80, 1); msleep(50); } else { dvbsky_gpio_ctrl(d, 0x04, 1); msleep(20); dvbsky_gpio_ctrl(d, 0x83, 0); dvbsky_gpio_ctrl(d, 0xc0, 1); msleep(100); dvbsky_gpio_ctrl(d, 0x83, 1); dvbsky_gpio_ctrl(d, 0xc0, 0); msleep(50); } return WARM; } static int dvbsky_init(struct dvb_usb_device *d) { struct dvbsky_state *state = d_to_priv(d); state->last_lock = 0; return 0; } static int dvbsky_frontend_detach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); struct dvbsky_state *state = d_to_priv(d); dev_dbg(&d->udev->dev, "%s: adap=%d\n", __func__, adap->id); dvb_module_release(state->i2c_client_tuner); dvb_module_release(state->i2c_client_demod); dvb_module_release(state->i2c_client_ci); return 0; } /* DVB USB Driver stuff */ static struct dvb_usb_device_properties dvbsky_s960_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct dvbsky_state), .generic_bulk_ctrl_endpoint = 0x01, .generic_bulk_ctrl_endpoint_response = 0x81, .generic_bulk_ctrl_delay = DVBSKY_MSG_DELAY, .i2c_algo = &dvbsky_i2c_algo, .frontend_attach = dvbsky_s960_attach, .frontend_detach = dvbsky_frontend_detach, .init = dvbsky_init, .get_rc_config = dvbsky_get_rc_config, .streaming_ctrl = dvbsky_streaming_ctrl, .identify_state = dvbsky_identify_state, .read_mac_address = dvbsky_read_mac_addr, .num_adapters = 1, .adapter = { { .stream = DVB_USB_STREAM_BULK(0x82, 8, 4096), } } }; static struct dvb_usb_device_properties dvbsky_s960c_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct dvbsky_state), .generic_bulk_ctrl_endpoint = 0x01, .generic_bulk_ctrl_endpoint_response = 0x81, .generic_bulk_ctrl_delay = DVBSKY_MSG_DELAY, .i2c_algo = &dvbsky_i2c_algo, .frontend_attach = dvbsky_s960c_attach, .frontend_detach = dvbsky_frontend_detach, .init = dvbsky_init, .get_rc_config = dvbsky_get_rc_config, .streaming_ctrl = dvbsky_streaming_ctrl, .identify_state = dvbsky_identify_state, .read_mac_address = dvbsky_read_mac_addr, .num_adapters = 1, .adapter = { { .stream = DVB_USB_STREAM_BULK(0x82, 8, 4096), } } }; static struct dvb_usb_device_properties dvbsky_t680c_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct dvbsky_state), .generic_bulk_ctrl_endpoint = 0x01, .generic_bulk_ctrl_endpoint_response = 0x81, .generic_bulk_ctrl_delay = DVBSKY_MSG_DELAY, .i2c_algo = &dvbsky_i2c_algo, .frontend_attach = dvbsky_t680c_attach, .frontend_detach = dvbsky_frontend_detach, .init = dvbsky_init, .get_rc_config = dvbsky_get_rc_config, .streaming_ctrl = dvbsky_streaming_ctrl, .identify_state = dvbsky_identify_state, .read_mac_address = dvbsky_read_mac_addr, .num_adapters = 1, .adapter = { { .stream = DVB_USB_STREAM_BULK(0x82, 8, 4096), } } }; static struct dvb_usb_device_properties dvbsky_t330_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct dvbsky_state), .generic_bulk_ctrl_endpoint = 0x01, .generic_bulk_ctrl_endpoint_response = 0x81, .generic_bulk_ctrl_delay = DVBSKY_MSG_DELAY, .i2c_algo = &dvbsky_i2c_algo, .frontend_attach = dvbsky_t330_attach, .frontend_detach = dvbsky_frontend_detach, .init = dvbsky_init, .get_rc_config = dvbsky_get_rc_config, .streaming_ctrl = dvbsky_streaming_ctrl, .identify_state = dvbsky_identify_state, .read_mac_address = dvbsky_read_mac_addr, .num_adapters = 1, .adapter = { { .stream = DVB_USB_STREAM_BULK(0x82, 8, 4096), } } }; static struct dvb_usb_device_properties mygica_t230c_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .size_of_priv = sizeof(struct dvbsky_state), .generic_bulk_ctrl_endpoint = 0x01, .generic_bulk_ctrl_endpoint_response = 0x81, .generic_bulk_ctrl_delay = DVBSKY_MSG_DELAY, .i2c_algo = &dvbsky_i2c_algo, .frontend_attach = dvbsky_mygica_t230c_attach, .frontend_detach = dvbsky_frontend_detach, .init = dvbsky_init, .get_rc_config = dvbsky_get_rc_config, .streaming_ctrl = dvbsky_streaming_ctrl, .identify_state = dvbsky_identify_state, .num_adapters = 1, .adapter = { { .stream = DVB_USB_STREAM_BULK(0x82, 8, 4096), } } }; static const struct usb_device_id dvbsky_id_table[] = { { DVB_USB_DEVICE(0x0572, 0x6831, &dvbsky_s960_props, "DVBSky S960/S860", RC_MAP_DVBSKY) }, { DVB_USB_DEVICE(0x0572, 0x960c, &dvbsky_s960c_props, "DVBSky S960CI", RC_MAP_DVBSKY) }, { DVB_USB_DEVICE(0x0572, 0x680c, &dvbsky_t680c_props, "DVBSky T680CI", RC_MAP_DVBSKY) }, { DVB_USB_DEVICE(0x0572, 0x0320, &dvbsky_t330_props, "DVBSky T330", RC_MAP_DVBSKY) }, { DVB_USB_DEVICE(USB_VID_TECHNOTREND, USB_PID_TECHNOTREND_TVSTICK_CT2_4400, &dvbsky_t330_props, "TechnoTrend TVStick CT2-4400", RC_MAP_TT_1500) }, { DVB_USB_DEVICE(USB_VID_TECHNOTREND, USB_PID_TECHNOTREND_CONNECT_CT2_4650_CI, &dvbsky_t680c_props, "TechnoTrend TT-connect CT2-4650 CI", RC_MAP_TT_1500) }, { DVB_USB_DEVICE(USB_VID_TECHNOTREND, USB_PID_TECHNOTREND_CONNECT_CT2_4650_CI_2, &dvbsky_t680c_props, "TechnoTrend TT-connect CT2-4650 CI v1.1", RC_MAP_TT_1500) }, { DVB_USB_DEVICE(USB_VID_TECHNOTREND, USB_PID_TECHNOTREND_CONNECT_S2_4650_CI, &dvbsky_s960c_props, "TechnoTrend TT-connect S2-4650 CI", RC_MAP_TT_1500) }, { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_H7_3, &dvbsky_t680c_props, "Terratec H7 Rev.4", RC_MAP_TT_1500) }, { DVB_USB_DEVICE(USB_VID_TERRATEC, USB_PID_TERRATEC_CINERGY_S2_R4, &dvbsky_s960_props, "Terratec Cinergy S2 Rev.4", RC_MAP_DVBSKY) }, { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230, &mygica_t230c_props, "MyGica Mini DVB-(T/T2/C) USB Stick T230", RC_MAP_TOTAL_MEDIA_IN_HAND_02) }, { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230C, &mygica_t230c_props, "MyGica Mini DVB-(T/T2/C) USB Stick T230C", RC_MAP_TOTAL_MEDIA_IN_HAND_02) }, { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230C_LITE, &mygica_t230c_props, "MyGica Mini DVB-(T/T2/C) USB Stick T230C Lite", NULL) }, { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230C2, &mygica_t230c_props, "MyGica Mini DVB-(T/T2/C) USB Stick T230C v2", RC_MAP_TOTAL_MEDIA_IN_HAND_02) }, { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230C2_LITE, &mygica_t230c_props, "MyGica Mini DVB-(T/T2/C) USB Stick T230C v2 Lite", NULL) }, { DVB_USB_DEVICE(USB_VID_CONEXANT, USB_PID_MYGICA_T230A, &mygica_t230c_props, "MyGica Mini DVB-(T/T2/C) USB Stick T230A", NULL) }, { } }; MODULE_DEVICE_TABLE(usb, dvbsky_id_table); static struct usb_driver dvbsky_usb_driver = { .name = KBUILD_MODNAME, .id_table = dvbsky_id_table, .probe = dvb_usbv2_probe, .disconnect = dvb_usbv2_disconnect, .suspend = dvb_usbv2_suspend, .resume = dvb_usbv2_resume, .reset_resume = dvb_usbv2_reset_resume, .no_dynamic_id = 1, .soft_unbind = 1, }; module_usb_driver(dvbsky_usb_driver); MODULE_AUTHOR("Max nibble <nibble.max@gmail.com>"); MODULE_DESCRIPTION("Driver for DVBSky USB"); MODULE_LICENSE("GPL");
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 // SPDX-License-Identifier: GPL-2.0-only /*************************************************************************** * Copyright (C) 2010-2012 by Bruno Prémont <bonbons@linux-vserver.org> * * * * Based on Logitech G13 driver (v0.4) * * Copyright (C) 2009 by Rick L. Vinyard, Jr. <rvinyard@cs.nmsu.edu> * * * ***************************************************************************/ #include <linux/hid.h> #include <linux/hid-debug.h> #include <linux/input.h> #include "hid-ids.h" #include <linux/fb.h> #include <linux/vmalloc.h> #include <linux/backlight.h> #include <linux/lcd.h> #include <linux/leds.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/completion.h> #include <linux/uaccess.h> #include <linux/module.h> #include <media/rc-core.h> #include "hid-picolcd.h" int picolcd_raw_cir(struct picolcd_data *data, struct hid_report *report, u8 *raw_data, int size) { unsigned long flags; int i, w, sz; struct ir_raw_event rawir = {}; /* ignore if rc_dev is NULL or status is shunned */ spin_lock_irqsave(&data->lock, flags); if (!data->rc_dev || (data->status & PICOLCD_CIR_SHUN)) { spin_unlock_irqrestore(&data->lock, flags); return 1; } spin_unlock_irqrestore(&data->lock, flags); /* PicoLCD USB packets contain 16-bit intervals in network order, * with value negated for pulse. Intervals are in microseconds. * * Note: some userspace LIRC code for PicoLCD says negated values * for space - is it a matter of IR chip? (pulse for my TSOP2236) * * In addition, the first interval seems to be around 15000 + base * interval for non-first report of IR data - thus the quirk below * to get RC_CODE to understand Sony and JVC remotes I have at hand */ sz = size > 0 ? min((int)raw_data[0], size-1) : 0; for (i = 0; i+1 < sz; i += 2) { w = (raw_data[i] << 8) | (raw_data[i+1]); rawir.pulse = !!(w & 0x8000); rawir.duration = rawir.pulse ? (65536 - w) : w; /* Quirk!! - see above */ if (i == 0 && rawir.duration > 15000) rawir.duration -= 15000; ir_raw_event_store(data->rc_dev, &rawir); } ir_raw_event_handle(data->rc_dev); return 1; } static int picolcd_cir_open(struct rc_dev *dev) { struct picolcd_data *data = dev->priv; unsigned long flags; spin_lock_irqsave(&data->lock, flags); data->status &= ~PICOLCD_CIR_SHUN; spin_unlock_irqrestore(&data->lock, flags); return 0; } static void picolcd_cir_close(struct rc_dev *dev) { struct picolcd_data *data = dev->priv; unsigned long flags; spin_lock_irqsave(&data->lock, flags); data->status |= PICOLCD_CIR_SHUN; spin_unlock_irqrestore(&data->lock, flags); } /* initialize CIR input device */ int picolcd_init_cir(struct picolcd_data *data, struct hid_report *report) { struct rc_dev *rdev; int ret = 0; rdev = rc_allocate_device(RC_DRIVER_IR_RAW); if (!rdev) return -ENOMEM; rdev->priv = data; rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rdev->open = picolcd_cir_open; rdev->close = picolcd_cir_close; rdev->device_name = data->hdev->name; rdev->input_phys = data->hdev->phys; rdev->input_id.bustype = data->hdev->bus; rdev->input_id.vendor = data->hdev->vendor; rdev->input_id.product = data->hdev->product; rdev->input_id.version = data->hdev->version; rdev->dev.parent = &data->hdev->dev; rdev->driver_name = PICOLCD_NAME; rdev->map_name = RC_MAP_RC6_MCE; rdev->timeout = MS_TO_US(100); rdev->rx_resolution = 1; ret = rc_register_device(rdev); if (ret) goto err; data->rc_dev = rdev; return 0; err: rc_free_device(rdev); return ret; } void picolcd_exit_cir(struct picolcd_data *data) { struct rc_dev *rdev = data->rc_dev; data->rc_dev = NULL; rc_unregister_device(rdev); }
42 30 32 29 19 8 14 12 3 1 3 2 1 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 1 2 1 3 2 1 3 2 2 2 2 2 2 2 2 3 3 3 2 3 1 3 3 1 3 2 1 3 1 1 27 2 1 1 2 9 9 2 2 2 1 2 1 4 2 1 4 3 1 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> #include "power.h" #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, * they should always be called with system_transition_mutex held * (gfp_allowed_mask also should only be modified with system_transition_mutex * held, unless the suspend/hibernate code is guaranteed not to run in parallel * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; } } void pm_restrict_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } unsigned int lock_system_sleep(void) { unsigned int flags = current->flags; current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { ktime_t start; long elapsed_msecs; start = ktime_get(); ksys_sync(); elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); pr_info("Filesystems sync: %ld.%03ld seconds\n", elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); int register_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(register_pm_notifier); int unregister_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } int pm_notifier_call_chain(unsigned long val) { return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_async_enabled = val; return n; } power_attr(pm_async); #ifdef CONFIG_SUSPEND static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { if (i >= PM_SUSPEND_MEM && cxl_mem_active()) continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) count += sysfs_emit_at(buf, count, "[%s] ", label); else count += sysfs_emit_at(buf, count, "%s ", label); } } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static suspend_state_t decode_suspend_state(const char *buf, size_t n) { suspend_state_t state; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = mem_sleep_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } return PM_SUSPEND_ON; } static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_suspend_state(buf, n); if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) mem_sleep_current = state; else error = -EINVAL; out: pm_autosleep_unlock(); return error ? error : n; } power_attr(mem_sleep); /* * sync_on_suspend: invoke ksys_sync_helper() before suspend. * * show() returns whether ksys_sync_helper() is invoked before suspend. * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; sync_on_suspend_enabled = !!val; return n; } power_attr(sync_on_suspend); #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_PM_SLEEP_DEBUG int pm_test_level = TEST_NONE; static const char * const pm_tests[__TEST_AFTER_LAST] = { [TEST_NONE] = "none", [TEST_CORE] = "core", [TEST_CPUS] = "processors", [TEST_PLATFORM] = "platform", [TEST_DEVICES] = "devices", [TEST_FREEZER] = "freezer", }; static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]); else count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]); } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int sleep_flags; const char * const *s; int error = -EINVAL; int level; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { pm_test_level = level; error = 0; break; } unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ #define SUSPEND_NR_STEPS SUSPEND_RESUME #define REC_FAILED_NUM 2 struct suspend_stats { unsigned int step_failures[SUSPEND_NR_STEPS]; unsigned int success; unsigned int fail; int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; int last_failed_errno; int errno[REC_FAILED_NUM]; int last_failed_step; u64 last_hw_sleep; u64 total_hw_sleep; u64 max_hw_sleep; enum suspend_stat_step failed_steps[REC_FAILED_NUM]; }; static struct suspend_stats suspend_stats; static DEFINE_MUTEX(suspend_stats_lock); void dpm_save_failed_dev(const char *name) { mutex_lock(&suspend_stats_lock); strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], name, sizeof(suspend_stats.failed_devs[0])); suspend_stats.last_failed_dev++; suspend_stats.last_failed_dev %= REC_FAILED_NUM; mutex_unlock(&suspend_stats_lock); } void dpm_save_failed_step(enum suspend_stat_step step) { suspend_stats.step_failures[step-1]++; suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; suspend_stats.last_failed_step++; suspend_stats.last_failed_step %= REC_FAILED_NUM; } void dpm_save_errno(int err) { if (!err) { suspend_stats.success++; return; } suspend_stats.fail++; suspend_stats.errno[suspend_stats.last_failed_errno] = err; suspend_stats.last_failed_errno++; suspend_stats.last_failed_errno %= REC_FAILED_NUM; } void pm_report_hw_sleep_time(u64 t) { suspend_stats.last_hw_sleep = t; suspend_stats.total_hw_sleep += t; } EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); void pm_report_max_hw_sleep(u64 t) { suspend_stats.max_hw_sleep = t; } EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); static const char * const suspend_step_names[] = { [SUSPEND_WORKING] = "", [SUSPEND_FREEZE] = "freeze", [SUSPEND_PREPARE] = "prepare", [SUSPEND_SUSPEND] = "suspend", [SUSPEND_SUSPEND_LATE] = "suspend_late", [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq", [SUSPEND_RESUME_NOIRQ] = "resume_noirq", [SUSPEND_RESUME_EARLY] = "resume_early", [SUSPEND_RESUME] = "resume", }; #define suspend_attr(_name, format_str) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sysfs_emit(buf, format_str, suspend_stats._name);\ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_attr(success, "%u\n"); suspend_attr(fail, "%u\n"); suspend_attr(last_hw_sleep, "%llu\n"); suspend_attr(total_hw_sleep, "%llu\n"); suspend_attr(max_hw_sleep, "%llu\n"); #define suspend_step_attr(_name, step) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sysfs_emit(buf, "%u\n", \ suspend_stats.step_failures[step-1]); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_step_attr(failed_freeze, SUSPEND_FREEZE); suspend_step_attr(failed_prepare, SUSPEND_PREPARE); suspend_step_attr(failed_suspend, SUSPEND_SUSPEND); suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE); suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ); suspend_step_attr(failed_resume, SUSPEND_RESUME); suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY); suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ); static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; char *last_failed_dev = NULL; index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; return sysfs_emit(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); static ssize_t last_failed_errno_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; int last_failed_errno; index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; return sysfs_emit(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); static ssize_t last_failed_step_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { enum suspend_stat_step step; int index; index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; return sysfs_emit(buf, "%s\n", suspend_step_names[step]); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); static struct attribute *suspend_attrs[] = { &success.attr, &fail.attr, &failed_freeze.attr, &failed_prepare.attr, &failed_suspend.attr, &failed_suspend_late.attr, &failed_suspend_noirq.attr, &failed_resume.attr, &failed_resume_early.attr, &failed_resume_noirq.attr, &last_failed_dev.attr, &last_failed_errno.attr, &last_failed_step.attr, &last_hw_sleep.attr, &total_hw_sleep.attr, &max_hw_sleep.attr, NULL, }; static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { if (attr != &last_hw_sleep.attr && attr != &total_hw_sleep.attr && attr != &max_hw_sleep.attr) return 0444; #ifdef CONFIG_ACPI if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) return 0444; #endif return 0; } static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, .is_visible = suspend_attr_is_visible, }; #ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; enum suspend_stat_step step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; seq_printf(s, "success: %u\nfail: %u\n", suspend_stats.success, suspend_stats.fail); for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++) seq_printf(s, "failed_%s: %u\n", suspend_step_names[step], suspend_stats.step_failures[step-1]); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", suspend_step_names[suspend_stats.failed_steps[last_step]]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_step_names[suspend_stats.failed_steps[index]]); } return 0; } DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, NULL, NULL, &suspend_stats_fops); return 0; } late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG /* * pm_print_times: print time taken by devices to suspend and resume. * * show() returns whether printing of suspend and resume times is enabled. * store() accepts 0 or 1. 0 disables printing and 1 enables it. */ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_print_times_enabled = !!val; return n; } power_attr(pm_print_times); static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (!pm_wakeup_irq()) return -ENODATA; return sysfs_emit(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_debug_messages_on = !!val; return n; } power_attr(pm_debug_messages); static int __init pm_debug_messages_setup(char *str) { pm_debug_messages_on = true; return 1; } __setup("pm_debug_messages", pm_debug_messages_setup); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ struct kobject *power_kobj; /* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", * "freeze" and "disk" (hibernation). * See Documentation/admin-guide/pm/sleep-states.rst for a description of * what they mean. * * store() accepts one of those strings, translates it into the proper * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) count += sysfs_emit_at(buf, count, "%s ", pm_states[i]); #endif if (hibernation_available()) count += sysfs_emit_at(buf, count, "disk "); /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state; #endif char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; /* Check hibernation first. */ if (len == 4 && str_has_prefix(buf, "disk")) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = pm_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } #endif return PM_SUSPEND_ON; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_state(buf, n); if (state < PM_SUSPEND_MAX) { if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_suspend(state); } else if (state == PM_SUSPEND_MAX) { error = hibernate(); } else { error = -EINVAL; } out: pm_autosleep_unlock(); return error ? error : n; } power_attr(state); #ifdef CONFIG_PM_SLEEP /* * The 'wakeup_count' attribute, along with the functions defined in * drivers/base/power/wakeup.c, provides a means by which wakeup events can be * handled in a non-racy way. * * If a wakeup event occurs when the system is in a sleep state, it simply is * woken up. In turn, if an event that would wake the system up from a sleep * state occurs when it is undergoing a transition to that sleep state, the * transition should be aborted. Moreover, if such an event occurs when the * system is in the working state, an attempt to start a transition to the * given sleep state should fail during certain period after the detection of * the event. Using the 'state' attribute alone is not sufficient to satisfy * these requirements, because a wakeup event may occur exactly when 'state' * is being written to and may be delivered to user space right before it is * frozen, so the event will remain only partially processed until the system is * woken up by another event. In particular, it won't cause the transition to * a sleep state to be aborted. * * This difficulty may be overcome if user space uses 'wakeup_count' before * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. */ static ssize_t wakeup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int val; return pm_get_wakeup_count(&val, true) ? sysfs_emit(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int val; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) error = n; else pm_print_active_wakeup_sources(); } out: pm_autosleep_unlock(); return error; } power_attr(wakeup_count); #ifdef CONFIG_PM_AUTOSLEEP static ssize_t autosleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) return sysfs_emit(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) return sysfs_emit(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sysfs_emit(buf, "disk\n"); #else return sysfs_emit(buf, "error\n"); #endif } static ssize_t autosleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state = decode_state(buf, n); int error; if (state == PM_SUSPEND_ON && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_autosleep_set_state(state); return error ? error : n; } power_attr(autosleep); #endif /* CONFIG_PM_AUTOSLEEP */ #ifdef CONFIG_PM_WAKELOCKS static ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, true); } static ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_lock(buf); return error ? error : n; } power_attr(wake_lock); static ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, false); } static ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_unlock(buf); return error ? error : n; } power_attr(wake_unlock); #endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_trace_enabled); } static ssize_t pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int val; if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; if (pm_trace_enabled) { pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" "PM: Correct system time has to be restored manually after resume.\n"); } return n; } return -EINVAL; } power_attr(pm_trace); static ssize_t pm_trace_dev_match_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return show_trace_dev_match(buf, PAGE_SIZE); } power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ #ifdef CONFIG_FREEZER static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; freeze_timeout_msecs = val; return n; } power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, #ifdef CONFIG_SUSPEND &mem_sleep_attr.attr, &sync_on_suspend_attr.attr, #endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif #ifdef CONFIG_PM_WAKELOCKS &wake_lock_attr.attr, &wake_unlock_attr.attr, #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_test_attr.attr, &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif NULL, }; static const struct attribute_group attr_group = { .attrs = g, }; static const struct attribute_group *attr_groups[] = { &attr_group, #ifdef CONFIG_PM_SLEEP &suspend_attr_group, #endif NULL, }; struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } static int __init pm_init(void) { int error = pm_start_workqueue(); if (error) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; error = sysfs_create_groups(power_kobj, attr_groups); if (error) return error; pm_print_times_init(); return pm_autosleep_init(); } core_initcall(pm_init);
6925 1361 1221 6921 7034 41 2462 4798 757 59 709 755 755 38 59 38 45 4756 4757 206 148 4691 2462 2627 4121 45 4132 4120 2263 4131 7 4114 2336 2341 639 2345 119 119 49 118 85 85 43 44 44 11 11 11 41 41 18 40 2002 2013 2016 2015 2016 1791 4799 1791 84 79 215 2204 2195 1208 2 4789 393 4793 94 94 4775 4786 3990 1218 4757 4492 860 4752 2201 4754 2206 4754 4759 4768 2157 4788 4795 4791 6278 3002 2996 1290 2501 2498 2338 550 513 15 3 3 15 10971 10965 10976 10995 10127 11113 10966 10965 11006 10151 4744 4760 90 87 6 44 44 27 44 2275 2275 2274 745 744 745 745 1160 167 107 138 149 21 73 72 70 3 324 94 94 94 325 2 2 2 2 2 2 83 82 83 83 2553 85 84 85 7 1 83 83 11 11 11 11 11 11 11 11 30 30 30 228 228 228 228 227 1037 1036 1028 1034 49 49 9 49 7 49 1036 1033 7 7 7 6 6 1035 1036 1032 1 2 2 2 1 1 1 1 1 2258 2188 2 2189 2256 2152 2258 2254 2255 2261 1032 49 49 43 16 49 42 1034 1031 43 1033 43 136 136 135 135 136 135 136 134 844 2 2 883 882 1 881 842 842 843 2 883 2 2 10369 10360 2763 10223 518 521 10003 10226 10386 7397 10372 10364 5535 5538 5538 5537 470 18 18 18 18 6582 6585 3241 6581 191 10364 10371 10362 9756 1052 9769 1061 9763 2843 9766 266 9768 3343 9767 429 9769 251 9778 9768 4264 4261 4255 9034 5920 5709 8858 9792 3 8693 8694 24 8705 8706 8490 8485 8483 437 437 438 439 2295 2294 2294 2294 64 61 47 47 42 42 42 42 42 42 42 47 64 64 1169 1174 1175 678 679 219 176 176 176 176 679 8609 679 8201 6916 6701 6696 6709 8616 4975 4963 4768 4962 5194 5179 5206 1695 1156 1156 1153 2 5206 104 104 156 330 156 156 190 190 5358 5357 5374 5361 120 4362 4359 4354 4352 19 19 19 4497 4505 4495 4506 4488 16 16 16 16 4496 4490 4501 4489 4506 21 19 19 19 1 19 19 19 19 19 16 16 4500 4500 4488 3 3 4496 4491 4484 4488 4488 4491 4487 4491 189 393 5332 4341 4352 4339 2237 2236 2245 3770 4335 1508 3600 204 2597 3 1 2 42 8 37 45 382 41 382 76 324 381 3 398 418 418 417 418 418 416 15 14 403 68 359 359 41 418 21 21 21 418 417 397 416 395 417 382 381 367 46 46 46 46 46 416 403 418 418 418 416 21 417 120 415 402 417 361 45 46 46 46 46 492 491 10 3 2 2 7 2 2 2 3419 3360 3357 2079 1147 27 27 2 2 25 10 10 10 1 10 15 27 3430 224 224 181 181 181 181 181 224 33 18 33 18 33 167 167 167 167 401 398 401 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" #include <asm/runtime-const.h> /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_chilren * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... * dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __ro_after_init; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); const struct qstr dotdot_name = QSTR_INIT("..", 2); EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. * * Marking the variables "used" ensures that the compiler doesn't * optimize them away completely on architectures with runtime * constant infrastructure, this allows debuggers to see their * values. But updating these values has no effect on those arches. */ static unsigned int d_hash_shift __ro_after_init __used; static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { return runtime_const_ptr(dentry_hashtable) + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, { .procname = "dentry-negative", .data = &dentry_negative_policy, .maxlen = sizeof(dentry_negative_policy), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } struct external_name { union { atomic_t count; struct rcu_head head; } u; unsigned char name[]; }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_iname; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { spin_lock(&dentry->d_lock); name->name = dentry->d_name; if (unlikely(dname_external(dentry))) { atomic_inc(&external_name(dentry)->u.count); } else { memcpy(name->inline_name, dentry->d_iname, dentry->d_name.len + 1); name->name.name = name->inline_name; } spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; /* * The negative counter only tracks dentries on the LRU. Don't inc if * d_lru is on another list. */ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->u.count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ while (dentry->d_sib.next) { next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_sib.next = next->d_sib.next; } } static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); cond_resched(); /* now that it's negative, ->d_parent is stable */ if (!IS_ROOT(dentry)) { parent = dentry->d_parent; spin_lock(&parent->d_lock); } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); return NULL; } return parent; } /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (unlikely(dentry->d_lockref.count)) return false; if (!inode || likely(spin_trylock(&inode->i_lock))) return true; do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); return false; } /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell * without a lock. False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { if (!dentry) return; might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } while (lock_for_kill(dentry)) { rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } rcu_read_unlock(); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(dput); static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); lockref_get(&alias->d_lockref); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { LIST_HEAD(dispose); struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); } while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } d_shrink_del(dentry); shrink_kill(dentry); } } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: dentry = d_first_child(this_parent); resume: hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { dentry = this_parent; this_parent = dentry->d_parent; spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ hlist_for_each_entry_continue(dentry, d_sib) { if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { rcu_read_unlock(); goto resume; } } goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: write_sequnlock(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; } else if (dentry->d_lockref.count < 0) { data->found++; } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (!dentry->d_lockref.count) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune * * Prune the dcache to remove unused children of the parent dentry. */ void shrink_dcache_parent(struct dentry *parent) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { spin_lock(&data.victim->d_lock); if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. * This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->u.count, 1); dname = p->name; } else { dname = dentry->d_iname; } dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_HLIST_NODE(&dentry->d_sib); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; if (!sb->s_d_op) d_set_d_op(dentry, &anon_ops); } return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. */ if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct super_block *sb; struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); sb = inode->i_sb; res = d_find_any_alias(inode); /* existing alias? */ if (res) goto out; new = d_alloc_anon(sb); if (!new) { res = ERR_PTR(-ENOMEM); goto out; } security_d_instantiate(new, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); /* recheck under lock */ if (likely(!res)) { /* still no alias, attach a disconnected dentry */ unsigned add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); hlist_bl_unlock(&sb->s_roots); } spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); inode = NULL; /* consumed by new->d_inode */ res = new; } else { spin_unlock(&inode->i_lock); dput(new); } out: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the case-exact dentry * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { d_lookup_done(found); dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false */ bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } EXPORT_SYMBOL_GPL(d_same_name); /* * This is __d_lookup_rcu() when the parent dentry has * DCACHE_OP_COMPARE, which makes things much nastier. */ static noinline struct dentry *__d_lookup_rcu_op_compare( const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { int tlen; const char *tname; unsigned seq; seqretry: seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) return __d_lookup_rcu_op_compare(parent, name, seqp); /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash_len != hashlen) continue; if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { if (dentry_negative_policy) __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. */ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); /* * - Unhash the dentry * - Retrieve and clear the waitqueue head in dentry * - Return the waitqueue head */ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { wait_queue_head_t *d_wait; struct hlist_bl_head *b; lockdep_assert_held(&dentry->d_lock); b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } void __d_lookup_unhash_wake(struct dentry *dentry) { spin_lock(&dentry->d_lock); wake_up_all(__d_lookup_unhash(dentry)); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode); } EXPORT_SYMBOL(d_add); /** * d_exact_alias - find and hash an exact unhashed alias * @entry: dentry to add * @inode: The inode to go with this dentry * * If an unhashed dentry with the same name/parent and desired * inode already exists, hash and return it. Otherwise, return * NULL. * * Parent directory should be locked. */ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) { struct dentry *alias; unsigned int hash = entry->d_name.hash; spin_lock(&inode->i_lock); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or * parent changes, because the parent inode i_mutex is held. */ if (alias->d_name.hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; if (!d_same_name(alias, entry->d_parent, &entry->d_name)) continue; spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { spin_unlock(&alias->d_lock); alias = NULL; } else { dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); return alias; } spin_unlock(&inode->i_lock); return NULL; } EXPORT_SYMBOL(d_exact_alias); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ memcpy(target->d_iname, dentry->d_name.name, dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; target->d_name.name = target->d_iname; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); target->d_name.name = dentry->d_name.name; dentry->d_name.name = dentry->d_iname; } else { /* * Both are internal. */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); } } } swap(dentry->d_name.hash_len, target->d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->u.count); dentry->d_name = target->d_name; } else { memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); dentry->d_name.name = dentry->d_iname; dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) kfree_rcu(old_name, u.head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); if (!hlist_unhashed(&target->d_sib)) __hlist_del(&target->d_sib); hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } if (!hlist_unhashed(&dentry->d_sib)) __hlist_del(&dentry->d_sib); hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; /* Access d_parent under rcu as d_move() may change it. */ rcu_read_lock(); seq = read_seqbegin(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); /* Try lockless once... */ if (read_seqretry(&rename_lock, seq)) { /* ...else acquire lock for progress even on deep chains. */ read_seqlock_excl(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); read_sequnlock_excl(&rename_lock); } rcu_read_unlock(); return subdir; } EXPORT_SYMBOL(is_subdir); static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } } return D_WALK_CONTINUE; } void d_genocide(struct dentry *parent) { d_walk(parent, parent, d_genocide_kill); } void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); } EXPORT_SYMBOL(d_mark_tmpfile); void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; inode_dec_link_count(inode); d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); /* * Obtain inode number of the parent dentry. */ ino_t d_parent_ino(struct dentry *dentry) { struct dentry *parent; struct inode *iparent; unsigned seq; ino_t ret; scoped_guard(rcu) { seq = raw_seqcount_begin(&dentry->d_seq); parent = READ_ONCE(dentry->d_parent); iparent = d_inode_rcu(parent); if (likely(iparent)) { ret = iparent->i_ino; if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; } } spin_lock(&dentry->d_lock); ret = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_iname); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
4980 4777 4953 109 3159 196 2924 2920 1766 2918 4972 4969 4962 4227 8453 7164 4962 14 106 4970 4452 225 141 4962 226 3105 4776 3987 4558 4970 3110 1946 3112 4778 4778 4778 4794 4781 4235 3994 2958 100 2924 2961 321 2921 4775 4790 4789 297 4778 3120 3104 3103 3112 3112 2991 3112 3114 214 60 60 4783 4776 172 297 297 3125 3114 76 76 6 8 3 11 8 8 1111 1 8 9 4 6 2 2105 6029 2105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment in folio_lru_refs() */ return order_base_2(refs + 1); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); bool workingset = flags & BIT(PG_workingset); /* * Return the number of accesses beyond PG_referenced, i.e., N-1 if the * total number of accesses is N>1, since N=0,1 both map to the first * tier. lru_tier_from_refs() will account for this off-by-one. Also see * the comment on MAX_NR_TIERS. */ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; } static inline void folio_clear_lru_refs(struct folio *folio) { set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || !lrugen->enabled) return false; /* * There are four common cases for this page: * 1. If it's hot, i.e., freshly faulted in, add it to the youngest * generation, and it's protected over the rest below. * 2. If it can't be evicted immediately, i.e., a dirty page pending * writeback, add it to the second youngest generation. * 3. If it should be evicted first, e.g., cold and clean from * folio_rotate_reclaimable(), add it to the oldest generation. * 4. Everything else falls between 2 & 3 above and is added to the * second oldest generation if it's considered inactive, or the * oldest generation otherwise. See lru_gen_is_active(). */ if (folio_test_active(folio)) seq = lrugen->max_seq; else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) seq = lrugen->max_seq - 1; else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) seq = lrugen->min_seq[type]; else seq = lrugen->min_seq[type] + 1; gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ mask = LRU_GEN_MASK; /* * Don't clear PG_workingset here because it can affect PSI accounting * if the activation is due to workingset refault. */ if (folio_test_active(folio)) mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); set_mask_bits(&folio->flags, mask, flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. * * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. */ static inline void pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); #endif } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif
13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 /* SPDX-License-Identifier: GPL-2.0-only */ /* * fence-chain: chain fences together in a timeline * * Copyright (C) 2018 Advanced Micro Devices, Inc. * Authors: * Christian König <christian.koenig@amd.com> */ #ifndef __LINUX_DMA_FENCE_CHAIN_H #define __LINUX_DMA_FENCE_CHAIN_H #include <linux/dma-fence.h> #include <linux/irq_work.h> #include <linux/slab.h> /** * struct dma_fence_chain - fence to represent an node of a fence chain * @base: fence base class * @prev: previous fence of the chain * @prev_seqno: original previous seqno before garbage collection * @fence: encapsulated fence * @lock: spinlock for fence handling */ struct dma_fence_chain { struct dma_fence base; struct dma_fence __rcu *prev; u64 prev_seqno; struct dma_fence *fence; union { /** * @cb: callback for signaling * * This is used to add the callback for signaling the * complection of the fence chain. Never used at the same time * as the irq work. */ struct dma_fence_cb cb; /** * @work: irq work item for signaling * * Irq work structure to allow us to add the callback without * running into lock inversion. Never used at the same time as * the callback. */ struct irq_work work; }; spinlock_t lock; }; /** * to_dma_fence_chain - cast a fence to a dma_fence_chain * @fence: fence to cast to a dma_fence_array * * Returns NULL if the fence is not a dma_fence_chain, * or the dma_fence_chain otherwise. */ static inline struct dma_fence_chain * to_dma_fence_chain(struct dma_fence *fence) { if (!fence || !dma_fence_is_chain(fence)) return NULL; return container_of(fence, struct dma_fence_chain, base); } /** * dma_fence_chain_contained - return the contained fence * @fence: the fence to test * * If the fence is a dma_fence_chain the function returns the fence contained * inside the chain object, otherwise it returns the fence itself. */ static inline struct dma_fence * dma_fence_chain_contained(struct dma_fence *fence) { struct dma_fence_chain *chain = to_dma_fence_chain(fence); return chain ? chain->fence : fence; } /** * dma_fence_chain_alloc * * Returns a new struct dma_fence_chain object or NULL on failure. * * This specialized allocator has to be a macro for its allocations to be * accounted separately (to have a separate alloc_tag). The typecast is * intentional to enforce typesafety. */ #define dma_fence_chain_alloc() \ ((struct dma_fence_chain *)kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL)) /** * dma_fence_chain_free * @chain: chain node to free * * Frees up an allocated but not used struct dma_fence_chain object. This * doesn't need an RCU grace period since the fence was never initialized nor * published. After dma_fence_chain_init() has been called the fence must be * released by calling dma_fence_put(), and not through this function. */ static inline void dma_fence_chain_free(struct dma_fence_chain *chain) { kfree(chain); }; /** * dma_fence_chain_for_each - iterate over all fences in chain * @iter: current fence * @head: starting point * * Iterate over all fences in the chain. We keep a reference to the current * fence while inside the loop which must be dropped when breaking out. * * For a deep dive iterator see dma_fence_unwrap_for_each(). */ #define dma_fence_chain_for_each(iter, head) \ for (iter = dma_fence_get(head); iter; \ iter = dma_fence_chain_walk(iter)) struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence); int dma_fence_chain_find_seqno(struct dma_fence **pfence, uint64_t seqno); void dma_fence_chain_init(struct dma_fence_chain *chain, struct dma_fence *prev, struct dma_fence *fence, uint64_t seqno); #endif /* __LINUX_DMA_FENCE_CHAIN_H */
13 13 13 13 13 13 13 18 18 18 18 12 3 1 7 5 5 13 13 13 11 11 2 2 13 13 6 6 7 7 7 7 7 13 11 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 // SPDX-License-Identifier: GPL-2.0 /* * xfrm_input.c * * Changes: * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * */ #include <linux/bottom_half.h> #include <linux/cache.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/percpu.h> #include <net/dst.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> #include <net/dst_metadata.h> #include <net/hotdata.h> #include "xfrm_inout.h" struct xfrm_trans_tasklet { struct work_struct work; spinlock_t queue_lock; struct sk_buff_head queue; }; struct xfrm_trans_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; int (*finish)(struct net *net, struct sock *sk, struct sk_buff *skb); struct net *net; }; #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1]; static struct gro_cells gro_cells; static struct net_device xfrm_napi_dev; static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; if (WARN_ON(afinfo->family > AF_INET6)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_input_afinfo_lock); if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) err = -EEXIST; else rcu_assign_pointer(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], afinfo); spin_unlock_bh(&xfrm_input_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_input_register_afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo) { int err = 0; spin_lock_bh(&xfrm_input_afinfo_lock); if (likely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family])) { if (unlikely(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family] != afinfo)) err = -EINVAL; else RCU_INIT_POINTER(xfrm_input_afinfo[afinfo->is_ipip][afinfo->family], NULL); } spin_unlock_bh(&xfrm_input_afinfo_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_input_unregister_afinfo); static const struct xfrm_input_afinfo *xfrm_input_get_afinfo(u8 family, bool is_ipip) { const struct xfrm_input_afinfo *afinfo; if (WARN_ON_ONCE(family > AF_INET6)) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_input_afinfo[is_ipip][family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, int err) { bool is_ipip = (protocol == IPPROTO_IPIP || protocol == IPPROTO_IPV6); const struct xfrm_input_afinfo *afinfo; int ret; afinfo = xfrm_input_get_afinfo(family, is_ipip); if (!afinfo) return -EAFNOSUPPORT; ret = afinfo->callback(skb, protocol, err); rcu_read_unlock(); return ret; } struct sec_path *secpath_set(struct sk_buff *skb) { struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH); sp = skb_ext_add(skb, SKB_EXT_SEC_PATH); if (!sp) return NULL; if (tmp) /* reused existing one (was COW'd if needed) */ return sp; /* allocated new secpath */ memset(sp->ovec, 0, sizeof(sp->ovec)); sp->olen = 0; sp->len = 0; sp->verified_cnt = 0; return sp; } EXPORT_SYMBOL(secpath_set); /* Fetch spi and seq from ipsec header */ int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq) { int offset, offset_seq; int hlen; switch (nexthdr) { case IPPROTO_AH: hlen = sizeof(struct ip_auth_hdr); offset = offsetof(struct ip_auth_hdr, spi); offset_seq = offsetof(struct ip_auth_hdr, seq_no); break; case IPPROTO_ESP: hlen = sizeof(struct ip_esp_hdr); offset = offsetof(struct ip_esp_hdr, spi); offset_seq = offsetof(struct ip_esp_hdr, seq_no); break; case IPPROTO_COMP: if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) return -EINVAL; *spi = htonl(ntohs(*(__be16 *)(skb_transport_header(skb) + 2))); *seq = 0; return 0; default: return 1; } if (!pskb_may_pull(skb, hlen)) return -EINVAL; *spi = *(__be32 *)(skb_transport_header(skb) + offset); *seq = *(__be32 *)(skb_transport_header(skb) + offset_seq); return 0; } EXPORT_SYMBOL(xfrm_parse_spi); static int xfrm4_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) { struct iphdr *iph; int optlen = 0; int err = -EINVAL; skb->protocol = htons(ETH_P_IP); if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) { struct ip_beet_phdr *ph; int phlen; if (!pskb_may_pull(skb, sizeof(*ph))) goto out; ph = (struct ip_beet_phdr *)skb->data; phlen = sizeof(*ph) + ph->padlen; optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen); if (optlen < 0 || optlen & 3 || optlen > 250) goto out; XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr; if (!pskb_may_pull(skb, phlen)) goto out; __skb_pull(skb, phlen); } skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); xfrm4_beet_make_header(skb); iph = ip_hdr(skb); iph->ihl += optlen / 4; iph->tot_len = htons(skb->len); iph->daddr = x->sel.daddr.a4; iph->saddr = x->sel.saddr.a4; iph->check = 0; iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl); err = 0; out: return err; } static void ipip_ecn_decapsulate(struct sk_buff *skb) { struct iphdr *inner_iph = ipip_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP_ECN_set_ce(inner_iph); } static int xfrm4_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; skb->protocol = htons(ETH_P_IP); if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto out; err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); if (skb->mac_len) eth_hdr(skb)->h_proto = skb->protocol; err = 0; out: return err; } static void ipip6_ecn_decapsulate(struct sk_buff *skb) { struct ipv6hdr *inner_iph = ipipv6_hdr(skb); if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP6_ECN_set_ce(skb, inner_iph); } static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; skb->protocol = htons(ETH_P_IPV6); if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto out; err = skb_unclone(skb, GFP_ATOMIC); if (err) goto out; if (x->props.flags & XFRM_STATE_DECAP_DSCP) ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb)); if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); if (skb->mac_len) eth_hdr(skb)->h_proto = skb->protocol; err = 0; out: return err; } static int xfrm6_remove_beet_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; int size = sizeof(struct ipv6hdr); int err; skb->protocol = htons(ETH_P_IPV6); err = skb_cow_head(skb, size + skb->mac_len); if (err) goto out; __skb_push(skb, size); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); ip6h = ipv6_hdr(skb); ip6h->payload_len = htons(skb->len - size); ip6h->daddr = x->sel.daddr.in6; ip6h->saddr = x->sel.saddr.in6; err = 0; out: return err; } /* Remove encapsulation header. * * The IP header will be moved over the top of the encapsulation * header. * * On entry, the transport header shall point to where the IP header * should be and the network header shall be set to where the IP * header currently is. skb->data shall point to the start of the * payload. */ static int xfrm_inner_mode_encap_remove(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: switch (x->sel.family) { case AF_INET: return xfrm4_remove_beet_encap(x, skb); case AF_INET6: return xfrm6_remove_beet_encap(x, skb); } break; case XFRM_MODE_TUNNEL: switch (XFRM_MODE_SKB_CB(skb)->protocol) { case IPPROTO_IPIP: return xfrm4_remove_tunnel_encap(x, skb); case IPPROTO_IPV6: return xfrm6_remove_tunnel_encap(x, skb); break; } return -EINVAL; } WARN_ON_ONCE(1); return -EOPNOTSUPP; } static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.family) { case AF_INET: xfrm4_extract_header(skb); break; case AF_INET6: xfrm6_extract_header(skb); break; default: WARN_ON_ONCE(1); return -EAFNOSUPPORT; } return xfrm_inner_mode_encap_remove(x, skb); } /* Remove encapsulation header. * * The IP header will be moved over the top of the encapsulation header. * * On entry, skb_transport_header() shall point to where the IP header * should be and skb_network_header() shall be set to where the IP header * currently is. skb->data shall point to the start of the payload. */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); if (xo) xo->orig_mac_len = skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ip_hdr(skb)->tot_len = htons(skb->len + ihl); skb_reset_transport_header(skb); return 0; } static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct xfrm_offload *xo = xfrm_offload(skb); int ihl = skb->data - skb_transport_header(skb); if (skb->transport_header != skb->network_header) { memmove(skb_transport_header(skb), skb_network_header(skb), ihl); if (xo) xo->orig_mac_len = skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0; skb->network_header = skb->transport_header; } ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); return 0; #else WARN_ON_ONCE(1); return -EAFNOSUPPORT; #endif } static int xfrm_inner_mode_input(struct xfrm_state *x, struct sk_buff *skb) { switch (x->props.mode) { case XFRM_MODE_BEET: case XFRM_MODE_TUNNEL: return xfrm_prepare_input(x, skb); case XFRM_MODE_TRANSPORT: if (x->props.family == AF_INET) return xfrm4_transport_input(x, skb); if (x->props.family == AF_INET6) return xfrm6_transport_input(x, skb); break; case XFRM_MODE_ROUTEOPTIMIZATION: WARN_ON_ONCE(1); break; default: WARN_ON_ONCE(1); break; } return -EOPNOTSUPP; } int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { const struct xfrm_state_afinfo *afinfo; struct net *net = dev_net(skb->dev); int err; __be32 seq; __be32 seq_hi; struct xfrm_state *x = NULL; xfrm_address_t *daddr; u32 mark = skb->mark; unsigned int family = AF_UNSPEC; int decaps = 0; int async = 0; bool xfrm_gro = false; bool crypto_done = false; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (encap_type < 0 || (xo && (xo->flags & XFRM_GRO || encap_type == 0 || encap_type == UDP_ENCAP_ESPINUDP))) { x = xfrm_input_state(skb); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); if (encap_type == -1) dev_put(skb->dev); goto drop; } family = x->props.family; /* An encap_type of -1 indicates async resumption. */ if (encap_type == -1) { async = 1; seq = XFRM_SKB_CB(skb)->seq.input.low; goto resume; } /* GRO call */ seq = XFRM_SPI_SKB_CB(skb)->seq; if (xo && (xo->flags & CRYPTO_DONE)) { crypto_done = true; family = XFRM_SPI_SKB_CB(skb)->family; if (!(xo->status & CRYPTO_SUCCESS)) { if (xo->status & (CRYPTO_TRANSPORT_AH_AUTH_FAILED | CRYPTO_TRANSPORT_ESP_AUTH_FAILED | CRYPTO_TUNNEL_AH_AUTH_FAILED | CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } if (xo->status & CRYPTO_INVALID_PROTOCOL) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } if (xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } } goto lock; } family = XFRM_SPI_SKB_CB(skb)->family; /* if tunnel is present override skb->mark value with tunnel i_key */ switch (family) { case AF_INET: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key); break; case AF_INET6: if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6) mark = be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key); break; } sp = secpath_set(skb); if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } seq = 0; if (!spi && xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); do { sp = skb_sec_path(skb); if (sp->len == XFRM_MAX_DEPTH) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } x = xfrm_input_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound(skb, family, spi, seq); goto drop; } if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR); xfrm_audit_state_notfound(skb, family, spi, seq); xfrm_state_put(x); x = NULL; goto drop; } skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } lock: spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { if (x->km.state == XFRM_STATE_ACQ) XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR); else XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEINVALID); goto drop_unlock; } if ((x->encap ? x->encap->encap_type : 0) != encap_type) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); goto drop_unlock; } if (xfrm_replay_check(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } if (xfrm_state_check_expire(x)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEEXPIRED); goto drop_unlock; } spin_unlock(&x->lock); if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } seq_hi = htonl(xfrm_replay_seqhi(x, seq)); XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; dev_hold(skb->dev); if (crypto_done) nexthdr = x->type_offload->input_tail(x, skb); else nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) return 0; resume: dev_put(skb->dev); spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, x->type->proto); x->stats.integrity_failed++; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); goto drop_unlock; } /* only the first xfrm gets the encap type */ encap_type = 0; if (xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } xfrm_replay_advance(x, seq); x->curlft.bytes += skb->len; x->curlft.packets++; x->lastused = ktime_get_real_seconds(); spin_unlock(&x->lock); XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; if (xfrm_inner_mode_input(x, skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { decaps = 1; break; } /* * We need the inner address. However, we only get here for * transport mode so the outer address is identical. */ daddr = &x->id.daddr; family = x->props.family; err = xfrm_parse_spi(skb, nexthdr, &spi, &seq); if (err < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } crypto_done = false; } while (!err); err = xfrm_rcv_cb(skb, family, x->type->proto, 0); if (err) goto drop; nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; } else { xo = xfrm_offload(skb); if (xo) xfrm_gro = xo->flags & XFRM_GRO; err = -EAFNOSUPPORT; rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(x->props.family); if (likely(afinfo)) err = afinfo->transport_finish(skb, xfrm_gro || async); rcu_read_unlock(); if (xfrm_gro) { sp = skb_sec_path(skb); if (sp) sp->olen = 0; if (skb_valid_dst(skb)) skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; } return err; } drop_unlock: spin_unlock(&x->lock); drop: xfrm_rcv_cb(skb, family, x && x->type ? x->type->proto : nexthdr, -1); kfree_skb(skb); return 0; } EXPORT_SYMBOL(xfrm_input); int xfrm_input_resume(struct sk_buff *skb, int nexthdr) { return xfrm_input(skb, nexthdr, 0, -1); } EXPORT_SYMBOL(xfrm_input_resume); static void xfrm_trans_reinject(struct work_struct *work) { struct xfrm_trans_tasklet *trans = container_of(work, struct xfrm_trans_tasklet, work); struct sk_buff_head queue; struct sk_buff *skb; __skb_queue_head_init(&queue); spin_lock_bh(&trans->queue_lock); skb_queue_splice_init(&trans->queue, &queue); spin_unlock_bh(&trans->queue_lock); local_bh_disable(); while ((skb = __skb_dequeue(&queue))) XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, NULL, skb); local_bh_enable(); } int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)) { struct xfrm_trans_tasklet *trans; trans = this_cpu_ptr(&xfrm_trans_tasklet); if (skb_queue_len(&trans->queue) >= READ_ONCE(net_hotdata.max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); XFRM_TRANS_SKB_CB(skb)->finish = finish; XFRM_TRANS_SKB_CB(skb)->net = net; spin_lock_bh(&trans->queue_lock); __skb_queue_tail(&trans->queue, skb); spin_unlock_bh(&trans->queue_lock); schedule_work(&trans->work); return 0; } EXPORT_SYMBOL(xfrm_trans_queue_net); int xfrm_trans_queue(struct sk_buff *skb, int (*finish)(struct net *, struct sock *, struct sk_buff *)) { return xfrm_trans_queue_net(dev_net(skb->dev), skb, finish); } EXPORT_SYMBOL(xfrm_trans_queue); void __init xfrm_input_init(void) { int err; int i; init_dummy_netdev(&xfrm_napi_dev); err = gro_cells_init(&gro_cells, &xfrm_napi_dev); if (err) gro_cells.cells = NULL; for_each_possible_cpu(i) { struct xfrm_trans_tasklet *trans; trans = &per_cpu(xfrm_trans_tasklet, i); spin_lock_init(&trans->queue_lock); __skb_queue_head_init(&trans->queue); INIT_WORK(&trans->work, xfrm_trans_reinject); } }
1085 214 8 8089 2315 408 311 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 /* SPDX-License-Identifier: GPL-2.0 */ /* rwsem.h: R/W semaphores, public interface * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h */ #ifndef _LINUX_RWSEM_H #define _LINUX_RWSEM_H #include <linux/linkage.h> #include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/cleanup.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifndef CONFIG_PREEMPT_RT #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif /* * For an uncontended rwsem, count and owner are the only fields a task * needs to touch when acquiring the rwsem. So they are put next to each * other to increase the chance that they will share the same cacheline. * * In a contended rwsem, the owner is likely the most frequently accessed * field in the structure as the optimistic waiter that holds the osq lock * will spin on owner. For an embedded rwsem, other hot fields in the * containing structure should be moved further away from the rwsem to * reduce the chance that they will share the same cacheline causing * cacheline bouncing problem. */ struct rw_semaphore { atomic_long_t count; /* * Write owner or one of the read owners as well flags regarding * the current state of the rwsem. Can be used as a speculative * check to see if the write owner is running on the cpu. */ atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define RWSEM_UNLOCKED_VALUE 0UL #define RWSEM_WRITER_LOCKED (1UL << 0) #define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE; } static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE); } static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED)); } /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_RWSEMS # define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else # define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ .wait_list = LIST_HEAD_INIT((name).wait_list), \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) extern void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) /* * This is the same regardless of which rwsem implementation that is being used. * It is just a heuristic meant to be called by somebody already holding the * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ static inline int rwsem_is_contended(struct rw_semaphore *sem) { return !list_empty(&sem->wait_list); } #else /* !CONFIG_PREEMPT_RT */ #include <linux/rwbase_rt.h> struct rw_semaphore { struct rwbase_rt rwbase; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define __RWSEM_INITIALIZER(name) \ { \ .rwbase = __RWBASE_INITIALIZER(name), \ __RWSEM_DEP_MAP_INIT(name) \ } #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); } #endif /* CONFIG_PREEMPT_RT */ /* * The functions below are the same for all rwsem implementations including * the RT specific variant. */ static inline void rwsem_assert_held(const struct rw_semaphore *sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held(sem); else rwsem_assert_held_nolockdep(sem); } static inline void rwsem_assert_held_write(const struct rw_semaphore *sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held_write(sem); else rwsem_assert_held_write_nolockdep(sem); } /* * lock for reading */ extern void down_read(struct rw_semaphore *sem); extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ extern int down_read_trylock(struct rw_semaphore *sem); /* * lock for writing */ extern void down_write(struct rw_semaphore *sem); extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ extern int down_write_trylock(struct rw_semaphore *sem); /* * release a read lock */ extern void up_read(struct rw_semaphore *sem); /* * release a write lock */ extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T)) DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0) DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T)) /* * downgrade write lock to read lock */ extern void downgrade_write(struct rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * nested locking. NOTE: rwsems are not allowed to recurse * (which occurs if the same task tries to acquire the same * lock instance multiple times), but multiple locks of the * same lock class might be taken, if the order of the locks * is always the same. This ordering rule can be expressed * to lockdep via the _nested() APIs, but enumerating the * subclasses that are used. (If the nesting relationship is * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); # define down_write_nest_lock(sem, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0) /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */
116 50 120 81 81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * if_alg: User-space algorithm interface * * Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_IF_ALG_H #define _CRYPTO_IF_ALG_H #include <linux/compiler.h> #include <linux/completion.h> #include <linux/if_alg.h> #include <linux/scatterlist.h> #include <linux/types.h> #include <linux/atomic.h> #include <net/sock.h> #include <crypto/aead.h> #include <crypto/skcipher.h> #define ALG_MAX_PAGES 16 struct alg_sock { /* struct sock must be the first member of struct alg_sock */ struct sock sk; struct sock *parent; atomic_t refcnt; atomic_t nokey_refcnt; const struct af_alg_type *type; void *private; }; struct af_alg_control { struct af_alg_iv *iv; int op; unsigned int aead_assoclen; }; struct af_alg_type { void *(*bind)(const char *name, u32 type, u32 mask); void (*release)(void *private); int (*setkey)(void *private, const u8 *key, unsigned int keylen); int (*setentropy)(void *private, sockptr_t entropy, unsigned int len); int (*accept)(void *private, struct sock *sk); int (*accept_nokey)(void *private, struct sock *sk); int (*setauthsize)(void *private, unsigned int authsize); struct proto_ops *ops; struct proto_ops *ops_nokey; struct module *owner; char name[14]; }; struct af_alg_sgl { struct sg_table sgt; struct scatterlist sgl[ALG_MAX_PAGES + 1]; bool need_unpin; }; /* TX SGL entry */ struct af_alg_tsgl { struct list_head list; unsigned int cur; /* Last processed SG entry */ struct scatterlist sg[]; /* Array of SGs forming the SGL */ }; #define MAX_SGL_ENTS ((4096 - sizeof(struct af_alg_tsgl)) / \ sizeof(struct scatterlist) - 1) /* RX SGL entry */ struct af_alg_rsgl { struct af_alg_sgl sgl; struct list_head list; size_t sg_num_bytes; /* Bytes of data in that SGL */ }; /** * struct af_alg_async_req - definition of crypto request * @iocb: IOCB for AIO operations * @sk: Socket the request is associated with * @first_rsgl: First RX SG * @last_rsgl: Pointer to last RX SG * @rsgl_list: Track RX SGs * @tsgl: Private, per request TX SGL of buffers to process * @tsgl_entries: Number of entries in priv. TX SGL * @outlen: Number of output bytes generated by crypto op * @areqlen: Length of this data structure * @cra_u: Cipher request */ struct af_alg_async_req { struct kiocb *iocb; struct sock *sk; struct af_alg_rsgl first_rsgl; struct af_alg_rsgl *last_rsgl; struct list_head rsgl_list; struct scatterlist *tsgl; unsigned int tsgl_entries; unsigned int outlen; unsigned int areqlen; union { struct aead_request aead_req; struct skcipher_request skcipher_req; } cra_u; /* req ctx trails this struct */ }; /** * struct af_alg_ctx - definition of the crypto context * * The crypto context tracks the input data during the lifetime of an AF_ALG * socket. * * @tsgl_list: Link to TX SGL * @iv: IV for cipher operation * @state: Existing state for continuing operation * @aead_assoclen: Length of AAD for AEAD cipher operations * @completion: Work queue for synchronous operation * @used: TX bytes sent to kernel. This variable is used to * ensure that user space cannot cause the kernel * to allocate too much memory in sendmsg operation. * @rcvused: Total RX bytes to be filled by kernel. This variable * is used to ensure user space cannot cause the kernel * to allocate too much memory in a recvmsg operation. * @more: More data to be expected from user space? * @merge: Shall new data from user space be merged into existing * SG? * @enc: Cryptographic operation to be performed when * recvmsg is invoked. * @init: True if metadata has been sent. * @len: Length of memory allocated for this data structure. * @inflight: Non-zero when AIO requests are in flight. */ struct af_alg_ctx { struct list_head tsgl_list; void *iv; void *state; size_t aead_assoclen; struct crypto_wait wait; size_t used; atomic_t rcvused; bool more; bool merge; bool enc; bool init; unsigned int len; unsigned int inflight; }; int af_alg_register_type(const struct af_alg_type *type); int af_alg_unregister_type(const struct af_alg_type *type); int af_alg_release(struct socket *sock); void af_alg_release_parent(struct sock *sk); int af_alg_accept(struct sock *sk, struct socket *newsock, struct proto_accept_arg *arg); void af_alg_free_sg(struct af_alg_sgl *sgl); static inline struct alg_sock *alg_sk(struct sock *sk) { return (struct alg_sock *)sk; } /** * Size of available buffer for sending data from user space to kernel. * * @sk socket of connection to user space * @return number of bytes still available */ static inline int af_alg_sndbuf(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; return max_t(int, max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE) - ctx->used, 0); } /** * Can the send buffer still be written to? * * @sk socket of connection to user space * @return true => writable, false => not writable */ static inline bool af_alg_writable(struct sock *sk) { return PAGE_SIZE <= af_alg_sndbuf(sk); } /** * Size of available buffer used by kernel for the RX user space operation. * * @sk socket of connection to user space * @return number of bytes still available */ static inline int af_alg_rcvbuf(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct af_alg_ctx *ctx = ask->private; return max_t(int, max_t(int, sk->sk_rcvbuf & PAGE_MASK, PAGE_SIZE) - atomic_read(&ctx->rcvused), 0); } /** * Can the RX buffer still be written to? * * @sk socket of connection to user space * @return true => writable, false => not writable */ static inline bool af_alg_readable(struct sock *sk) { return PAGE_SIZE <= af_alg_rcvbuf(sk); } unsigned int af_alg_count_tsgl(struct sock *sk, size_t bytes, size_t offset); void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst, size_t dst_offset); void af_alg_wmem_wakeup(struct sock *sk); int af_alg_wait_for_data(struct sock *sk, unsigned flags, unsigned min); int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, unsigned int ivsize); void af_alg_free_resources(struct af_alg_async_req *areq); void af_alg_async_cb(void *data, int err); __poll_t af_alg_poll(struct file *file, struct socket *sock, poll_table *wait); struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, unsigned int areqlen); int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, struct af_alg_async_req *areq, size_t maxsize, size_t *outlen); #endif /* _CRYPTO_IF_ALG_H */
90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Network device features. */ #ifndef _LINUX_NETDEV_FEATURES_H #define _LINUX_NETDEV_FEATURES_H #include <linux/types.h> #include <linux/bitops.h> #include <asm/byteorder.h> typedef u64 netdev_features_t; enum { NETIF_F_SG_BIT, /* Scatter/gather IO. */ NETIF_F_IP_CSUM_BIT, /* Can checksum TCP/UDP over IPv4. */ __UNUSED_NETIF_F_1, NETIF_F_HW_CSUM_BIT, /* Can checksum all the packets. */ NETIF_F_IPV6_CSUM_BIT, /* Can checksum TCP/UDP over IPV6 */ NETIF_F_HIGHDMA_BIT, /* Can DMA to high memory. */ NETIF_F_FRAGLIST_BIT, /* Scatter/gather IO. */ NETIF_F_HW_VLAN_CTAG_TX_BIT, /* Transmit VLAN CTAG HW acceleration */ NETIF_F_HW_VLAN_CTAG_RX_BIT, /* Receive VLAN CTAG HW acceleration */ NETIF_F_HW_VLAN_CTAG_FILTER_BIT,/* Receive filtering on VLAN CTAGs */ NETIF_F_VLAN_CHALLENGED_BIT, /* Device cannot handle VLAN packets */ NETIF_F_GSO_BIT, /* Enable software GSO. */ __UNUSED_NETIF_F_12, __UNUSED_NETIF_F_13, NETIF_F_GRO_BIT, /* Generic receive offload */ NETIF_F_LRO_BIT, /* large receive offload */ /**/NETIF_F_GSO_SHIFT, /* keep the order of SKB_GSO_* bits */ NETIF_F_TSO_BIT /* ... TCPv4 segmentation */ = NETIF_F_GSO_SHIFT, NETIF_F_GSO_ROBUST_BIT, /* ... ->SKB_GSO_DODGY */ NETIF_F_TSO_ECN_BIT, /* ... TCP ECN support */ NETIF_F_TSO_MANGLEID_BIT, /* ... IPV4 ID mangling allowed */ NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */ NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ NETIF_F_GSO_GRE_CSUM_BIT, /* ... GRE with csum with TSO */ NETIF_F_GSO_IPXIP4_BIT, /* ... IP4 or IP6 over IP4 with TSO */ NETIF_F_GSO_IPXIP6_BIT, /* ... IP4 or IP6 over IP6 with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ NETIF_F_GSO_PARTIAL_BIT, /* ... Only segment inner-most L4 * in hardware and all other * headers in software. */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ NETIF_F_GSO_UDP_BIT, /* ... UFO, deprecated except tuntap */ NETIF_F_GSO_UDP_L4_BIT, /* ... UDP payload GSO (not UFO) */ NETIF_F_GSO_FRAGLIST_BIT, /* ... Fraglist GSO */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ NETIF_F_GSO_FRAGLIST_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ __UNUSED_NETIF_F_37, NETIF_F_NTUPLE_BIT, /* N-tuple filters supported */ NETIF_F_RXHASH_BIT, /* Receive hashing offload */ NETIF_F_RXCSUM_BIT, /* Receive checksumming offload */ NETIF_F_NOCACHE_COPY_BIT, /* Use no-cache copyfromuser */ NETIF_F_LOOPBACK_BIT, /* Enable loopback */ NETIF_F_RXFCS_BIT, /* Append FCS to skb pkt data */ NETIF_F_RXALL_BIT, /* Receive errored frames too */ NETIF_F_HW_VLAN_STAG_TX_BIT, /* Transmit VLAN STAG HW acceleration */ NETIF_F_HW_VLAN_STAG_RX_BIT, /* Receive VLAN STAG HW acceleration */ NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_HW_TC_BIT, /* Offload TC infrastructure */ NETIF_F_HW_ESP_BIT, /* Hardware ESP transformation offload */ NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ NETIF_F_HW_TLS_TX_BIT, /* Hardware TLS TX offload */ NETIF_F_HW_TLS_RX_BIT, /* Hardware TLS RX offload */ NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ NETIF_F_GRO_FRAGLIST_BIT, /* Fraglist GRO */ NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */ NETIF_F_GRO_UDP_FWD_BIT, /* Allow UDP GRO for forwarding */ NETIF_F_HW_HSR_TAG_INS_BIT, /* Offload HSR tag insertion */ NETIF_F_HW_HSR_TAG_RM_BIT, /* Offload HSR tag removal */ NETIF_F_HW_HSR_FWD_BIT, /* Offload HSR forwarding */ NETIF_F_HW_HSR_DUP_BIT, /* Offload HSR duplication */ /* * Add your fresh new feature above and remember to update * netdev_features_strings[] in net/ethtool/common.c and maybe * some feature mask #defines below. Please also describe it * in Documentation/networking/netdev-features.rst. */ /**/NETDEV_FEATURE_COUNT }; /* copy'n'paste compression ;) */ #define __NETIF_F_BIT(bit) ((netdev_features_t)1 << (bit)) #define __NETIF_F(name) __NETIF_F_BIT(NETIF_F_##name##_BIT) #define NETIF_F_FCOE_CRC __NETIF_F(FCOE_CRC) #define NETIF_F_FRAGLIST __NETIF_F(FRAGLIST) #define NETIF_F_FSO __NETIF_F(FSO) #define NETIF_F_GRO __NETIF_F(GRO) #define NETIF_F_GRO_HW __NETIF_F(GRO_HW) #define NETIF_F_GSO __NETIF_F(GSO) #define NETIF_F_GSO_ROBUST __NETIF_F(GSO_ROBUST) #define NETIF_F_HIGHDMA __NETIF_F(HIGHDMA) #define NETIF_F_HW_CSUM __NETIF_F(HW_CSUM) #define NETIF_F_HW_VLAN_CTAG_FILTER __NETIF_F(HW_VLAN_CTAG_FILTER) #define NETIF_F_HW_VLAN_CTAG_RX __NETIF_F(HW_VLAN_CTAG_RX) #define NETIF_F_HW_VLAN_CTAG_TX __NETIF_F(HW_VLAN_CTAG_TX) #define NETIF_F_IP_CSUM __NETIF_F(IP_CSUM) #define NETIF_F_IPV6_CSUM __NETIF_F(IPV6_CSUM) #define NETIF_F_LOOPBACK __NETIF_F(LOOPBACK) #define NETIF_F_LRO __NETIF_F(LRO) #define NETIF_F_NOCACHE_COPY __NETIF_F(NOCACHE_COPY) #define NETIF_F_NTUPLE __NETIF_F(NTUPLE) #define NETIF_F_RXCSUM __NETIF_F(RXCSUM) #define NETIF_F_RXHASH __NETIF_F(RXHASH) #define NETIF_F_SCTP_CRC __NETIF_F(SCTP_CRC) #define NETIF_F_SG __NETIF_F(SG) #define NETIF_F_TSO6 __NETIF_F(TSO6) #define NETIF_F_TSO_ECN __NETIF_F(TSO_ECN) #define NETIF_F_TSO __NETIF_F(TSO) #define NETIF_F_VLAN_CHALLENGED __NETIF_F(VLAN_CHALLENGED) #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) #define NETIF_F_GSO_GRE_CSUM __NETIF_F(GSO_GRE_CSUM) #define NETIF_F_GSO_IPXIP4 __NETIF_F(GSO_IPXIP4) #define NETIF_F_GSO_IPXIP6 __NETIF_F(GSO_IPXIP6) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) #define NETIF_F_GSO_UDP_TUNNEL_CSUM __NETIF_F(GSO_UDP_TUNNEL_CSUM) #define NETIF_F_TSO_MANGLEID __NETIF_F(TSO_MANGLEID) #define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL) #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) #define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP) #define NETIF_F_GSO_UDP __NETIF_F(GSO_UDP) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_HW_TC __NETIF_F(HW_TC) #define NETIF_F_HW_ESP __NETIF_F(HW_ESP) #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) #define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) #define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) #define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4) #define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) #define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST) #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST) #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC) #define NETIF_F_GRO_UDP_FWD __NETIF_F(GRO_UDP_FWD) #define NETIF_F_HW_HSR_TAG_INS __NETIF_F(HW_HSR_TAG_INS) #define NETIF_F_HW_HSR_TAG_RM __NETIF_F(HW_HSR_TAG_RM) #define NETIF_F_HW_HSR_FWD __NETIF_F(HW_HSR_FWD) #define NETIF_F_HW_HSR_DUP __NETIF_F(HW_HSR_DUP) /* Finds the next feature with the highest number of the range of start-1 till 0. */ static inline int find_next_netdev_feature(u64 feature, unsigned long start) { /* like BITMAP_LAST_WORD_MASK() for u64 * this sets the most significant 64 - start to 0. */ feature &= ~0ULL >> (-start & ((sizeof(feature) * 8) - 1)); return fls64(feature) - 1; } /* This goes for the MSB to the LSB through the set feature bits, * mask_addr should be a u64 and bit an int */ #define for_each_netdev_feature(mask_addr, bit) \ for ((bit) = find_next_netdev_feature((mask_addr), \ NETDEV_FEATURE_COUNT); \ (bit) >= 0; \ (bit) = find_next_netdev_feature((mask_addr), (bit))) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ #define NETIF_F_NEVER_CHANGE NETIF_F_VLAN_CHALLENGED /* remember that ((t)1 << t_BITS) is undefined in C99 */ #define NETIF_F_ETHTOOL_BITS ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \ (__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) - 1)) & \ ~NETIF_F_NEVER_CHANGE) /* Segmentation offload feature mask */ #define NETIF_F_GSO_MASK (__NETIF_F_BIT(NETIF_F_GSO_LAST + 1) - \ __NETIF_F_BIT(NETIF_F_GSO_SHIFT)) /* List of IP checksum features. Note that NETIF_F_HW_CSUM should not be * set in features when NETIF_F_IP_CSUM or NETIF_F_IPV6_CSUM are set-- * this would be contradictory */ #define NETIF_F_CSUM_MASK (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ NETIF_F_HW_CSUM) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | \ NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \ NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST) /* * If one device supports one of these features, then enable them * for all in netdev_increment_features. */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ NETIF_F_SG | NETIF_F_HIGHDMA | \ NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) /* * If one device doesn't support one of these features, then disable it * for all in netdev_increment_features. */ #define NETIF_F_ALL_FOR_ALL (NETIF_F_NOCACHE_COPY | NETIF_F_FSO) /* * If upper/master device has these features disabled, they must be disabled * on all lower/slave devices as well. */ #define NETIF_F_UPPER_DISABLES NETIF_F_LRO /* changeable features with no special hardware requirements */ #define NETIF_F_SOFT_FEATURES (NETIF_F_GSO | NETIF_F_GRO) /* Changeable features with no special hardware requirements that defaults to off. */ #define NETIF_F_SOFT_FEATURES_OFF (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD) #define NETIF_F_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_FILTER | \ NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | \ NETIF_F_HW_VLAN_STAG_FILTER | \ NETIF_F_HW_VLAN_STAG_RX | \ NETIF_F_HW_VLAN_STAG_TX) #define NETIF_F_GSO_ENCAP_ALL (NETIF_F_GSO_GRE | \ NETIF_F_GSO_GRE_CSUM | \ NETIF_F_GSO_IPXIP4 | \ NETIF_F_GSO_IPXIP6 | \ NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) #endif /* _LINUX_NETDEV_FEATURES_H */
2049 2048 14781 14583 13 12 13 461 460 1922 4 4 4 402 402 1584 1583 1583 1576 1575 1578 1578 1576 1581 1108 1109 1108 3 1102 3 14833 14633 3401 220 2 2 2 2 2 2 2 220 220 220 218 220 220 1 1 1 1 1 1 1 1 1 1 2874 2893 2896 2879 2879 2876 2883 2896 2895 2885 2876 2880 2895 2877 2881 19 1 1 1 1 7479 5 5 7471 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); p->vfork_done = &kthread->exited; p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); if (cpu >= 0) node = cpu_to_node(cpu); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; if (cpu >= 0) kthread_bind(task, cpu); worker->flags = flags; worker->task = task; wake_up_process(task); return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker - create a kthread worker * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(-1, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(cpu, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * del_timer_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
23 2 1 1 1 19 5 19 19 14 14 13 13 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _CCID_H #define _CCID_H /* * net/dccp/ccid.h * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * CCID infrastructure */ #include <net/sock.h> #include <linux/compiler.h> #include <linux/dccp.h> #include <linux/list.h> #include <linux/module.h> /* maximum value for a CCID (RFC 4340, 19.5) */ #define CCID_MAX 255 #define CCID_SLAB_NAME_LENGTH 32 struct tcp_info; /** * struct ccid_operations - Interface to Congestion-Control Infrastructure * * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) * @ccid_name: alphabetical identifier string for @ccid_id * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket * * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) * @ccid_hc_rx_packet_recv: implements the HC-receiver side * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender */ struct ccid_operations { unsigned char ccid_id; __u32 ccid_ccmps; const char *ccid_name; struct kmem_cache *ccid_hc_rx_slab, *ccid_hc_tx_slab; char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; __u32 ccid_hc_rx_obj_size, ccid_hc_tx_obj_size; /* Interface Routines */ int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); void (*ccid_hc_rx_exit)(struct sock *sk); void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, struct tcp_info *info); int (*ccid_hc_rx_getsockopt)(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen); int (*ccid_hc_tx_getsockopt)(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen); }; extern struct ccid_operations ccid2_ops; #ifdef CONFIG_IP_DCCP_CCID3 extern struct ccid_operations ccid3_ops; #endif int ccid_initialize_builtins(void); void ccid_cleanup_builtins(void); struct ccid { struct ccid_operations *ccid_ops; char ccid_priv[]; }; static inline void *ccid_priv(const struct ccid *ccid) { return (void *)ccid->ccid_priv; } bool ccid_support_check(u8 const *ccid_array, u8 array_len); int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, char __user *, int __user *); struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_rx_ccid; if (ccid == NULL || ccid->ccid_ops == NULL) return -1; return ccid->ccid_ops->ccid_id; } static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_tx_ccid; if (ccid == NULL || ccid->ccid_ops == NULL) return -1; return ccid->ccid_ops->ccid_id; } void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); /* * Congestion control of queued data packets via CCID decision. * * The TX CCID performs its congestion-control by indicating whether and when a * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). * The following modes are supported via the symbolic constants below: * - timer-based pacing (CCID returns a delay value in milliseconds); * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). */ enum ccid_dequeueing_decision { CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ CCID_PACKET_ERR = 0xF0000, /* error condition */ }; static inline int ccid_packet_dequeue_eval(const int return_code) { if (return_code < 0) return CCID_PACKET_ERR; if (return_code == 0) return CCID_PACKET_SEND_AT_ONCE; if (return_code <= CCID_PACKET_DELAY_MAX) return CCID_PACKET_DELAY; return return_code; } static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); return CCID_PACKET_SEND_AT_ONCE; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); } static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } /** * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) * @val: value of @opt * @len: length of @val in bytes */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); } /** * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender * Arguments are analogous to ccid_hc_tx_parse_options() */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); return 0; } static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, struct tcp_info *info) { if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL) ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); } static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, struct tcp_info *info) { if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); } static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, optval, optlen); return rc; } static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, optval, optlen); return rc; } #endif /* _CCID_H */
10 10 10 10 10 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 /* * linux/fs/nls/nls_iso8859-5.c * * Charset iso8859-5 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f, /* 0xb0*/ 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* 0xc0*/ 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* 0xd0*/ 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* 0xe0*/ 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* 0xf0*/ 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfd, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, /* 0xa8-0xaf */ }; static const unsigned char page04[256] = { 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x00, 0xae, 0xaf, /* 0x08-0x0f */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0x38-0x3f */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0xfe, 0xff, /* 0x58-0x5f */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x00, /* 0x10-0x17 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, page04, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page21, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xa0-0xa7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xad, 0xfe, 0xff, /* 0xa8-0xaf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xb0-0xb7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xd0-0xd7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xf0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xf0-0xf7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xfd, 0xae, 0xaf, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-5", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_5(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_5(void) { unregister_nls(&table); } module_init(init_nls_iso8859_5) module_exit(exit_nls_iso8859_5) MODULE_DESCRIPTION("NLS ISO 8859-5 (Cyrillic)"); MODULE_LICENSE("Dual BSD/GPL");
6 6 9 10 10 9 8 8 6 8 6 6 6 6 6 8 8 8 6 8 8 209 7 208 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: GPL-2.0-only /* * Input Multitouch Library * * Copyright (c) 2008-2010 Henrik Rydberg */ #include <linux/input/mt.h> #include <linux/export.h> #include <linux/slab.h> #include "input-core-private.h" #define TRKID_SGN ((TRKID_MAX + 1) >> 1) static void copy_abs(struct input_dev *dev, unsigned int dst, unsigned int src) { if (dev->absinfo && test_bit(src, dev->absbit)) { dev->absinfo[dst] = dev->absinfo[src]; dev->absinfo[dst].fuzz = 0; __set_bit(dst, dev->absbit); } } /** * input_mt_init_slots() - initialize MT input slots * @dev: input device supporting MT events and finger tracking * @num_slots: number of slots used by the device * @flags: mt tasks to handle in core * * This function allocates all necessary memory for MT slot handling * in the input device, prepares the ABS_MT_SLOT and * ABS_MT_TRACKING_ID events for use and sets up appropriate buffers. * Depending on the flags set, it also performs pointer emulation and * frame synchronization. * * May be called repeatedly. Returns -EINVAL if attempting to * reinitialize with a different number of slots. */ int input_mt_init_slots(struct input_dev *dev, unsigned int num_slots, unsigned int flags) { struct input_mt *mt = dev->mt; int i; if (!num_slots) return 0; if (mt) return mt->num_slots != num_slots ? -EINVAL : 0; /* Arbitrary limit for avoiding too large memory allocation. */ if (num_slots > 1024) return -EINVAL; mt = kzalloc(struct_size(mt, slots, num_slots), GFP_KERNEL); if (!mt) goto err_mem; mt->num_slots = num_slots; mt->flags = flags; input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0); input_set_abs_params(dev, ABS_MT_TRACKING_ID, 0, TRKID_MAX, 0, 0); if (flags & (INPUT_MT_POINTER | INPUT_MT_DIRECT)) { __set_bit(EV_KEY, dev->evbit); __set_bit(BTN_TOUCH, dev->keybit); copy_abs(dev, ABS_X, ABS_MT_POSITION_X); copy_abs(dev, ABS_Y, ABS_MT_POSITION_Y); copy_abs(dev, ABS_PRESSURE, ABS_MT_PRESSURE); } if (flags & INPUT_MT_POINTER) { __set_bit(BTN_TOOL_FINGER, dev->keybit); __set_bit(BTN_TOOL_DOUBLETAP, dev->keybit); if (num_slots >= 3) __set_bit(BTN_TOOL_TRIPLETAP, dev->keybit); if (num_slots >= 4) __set_bit(BTN_TOOL_QUADTAP, dev->keybit); if (num_slots >= 5) __set_bit(BTN_TOOL_QUINTTAP, dev->keybit); __set_bit(INPUT_PROP_POINTER, dev->propbit); } if (flags & INPUT_MT_DIRECT) __set_bit(INPUT_PROP_DIRECT, dev->propbit); if (flags & INPUT_MT_SEMI_MT) __set_bit(INPUT_PROP_SEMI_MT, dev->propbit); if (flags & INPUT_MT_TRACK) { unsigned int n2 = num_slots * num_slots; mt->red = kcalloc(n2, sizeof(*mt->red), GFP_KERNEL); if (!mt->red) goto err_mem; } /* Mark slots as 'inactive' */ for (i = 0; i < num_slots; i++) input_mt_set_value(&mt->slots[i], ABS_MT_TRACKING_ID, -1); /* Mark slots as 'unused' */ mt->frame = 1; dev->mt = mt; return 0; err_mem: kfree(mt); return -ENOMEM; } EXPORT_SYMBOL(input_mt_init_slots); /** * input_mt_destroy_slots() - frees the MT slots of the input device * @dev: input device with allocated MT slots * * This function is only needed in error path as the input core will * automatically free the MT slots when the device is destroyed. */ void input_mt_destroy_slots(struct input_dev *dev) { if (dev->mt) { kfree(dev->mt->red); kfree(dev->mt); } dev->mt = NULL; } EXPORT_SYMBOL(input_mt_destroy_slots); /** * input_mt_report_slot_state() - report contact state * @dev: input device with allocated MT slots * @tool_type: the tool type to use in this slot * @active: true if contact is active, false otherwise * * Reports a contact via ABS_MT_TRACKING_ID, and optionally * ABS_MT_TOOL_TYPE. If active is true and the slot is currently * inactive, or if the tool type is changed, a new tracking id is * assigned to the slot. The tool type is only reported if the * corresponding absbit field is set. * * Returns true if contact is active. */ bool input_mt_report_slot_state(struct input_dev *dev, unsigned int tool_type, bool active) { struct input_mt *mt = dev->mt; struct input_mt_slot *slot; int id; if (!mt) return false; slot = &mt->slots[mt->slot]; slot->frame = mt->frame; if (!active) { input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); return false; } id = input_mt_get_value(slot, ABS_MT_TRACKING_ID); if (id < 0) id = input_mt_new_trkid(mt); input_event(dev, EV_ABS, ABS_MT_TRACKING_ID, id); input_event(dev, EV_ABS, ABS_MT_TOOL_TYPE, tool_type); return true; } EXPORT_SYMBOL(input_mt_report_slot_state); /** * input_mt_report_finger_count() - report contact count * @dev: input device with allocated MT slots * @count: the number of contacts * * Reports the contact count via BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP, * BTN_TOOL_TRIPLETAP and BTN_TOOL_QUADTAP. * * The input core ensures only the KEY events already setup for * this device will produce output. */ void input_mt_report_finger_count(struct input_dev *dev, int count) { input_event(dev, EV_KEY, BTN_TOOL_FINGER, count == 1); input_event(dev, EV_KEY, BTN_TOOL_DOUBLETAP, count == 2); input_event(dev, EV_KEY, BTN_TOOL_TRIPLETAP, count == 3); input_event(dev, EV_KEY, BTN_TOOL_QUADTAP, count == 4); input_event(dev, EV_KEY, BTN_TOOL_QUINTTAP, count == 5); } EXPORT_SYMBOL(input_mt_report_finger_count); /** * input_mt_report_pointer_emulation() - common pointer emulation * @dev: input device with allocated MT slots * @use_count: report number of active contacts as finger count * * Performs legacy pointer emulation via BTN_TOUCH, ABS_X, ABS_Y and * ABS_PRESSURE. Touchpad finger count is emulated if use_count is true. * * The input core ensures only the KEY and ABS axes already setup for * this device will produce output. */ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) { struct input_mt *mt = dev->mt; struct input_mt_slot *oldest; int oldid, count, i; if (!mt) return; oldest = NULL; oldid = mt->trkid; count = 0; for (i = 0; i < mt->num_slots; ++i) { struct input_mt_slot *ps = &mt->slots[i]; int id = input_mt_get_value(ps, ABS_MT_TRACKING_ID); if (id < 0) continue; if ((id - oldid) & TRKID_SGN) { oldest = ps; oldid = id; } count++; } input_event(dev, EV_KEY, BTN_TOUCH, count > 0); if (use_count) { if (count == 0 && !test_bit(ABS_MT_DISTANCE, dev->absbit) && test_bit(ABS_DISTANCE, dev->absbit) && input_abs_get_val(dev, ABS_DISTANCE) != 0) { /* * Force reporting BTN_TOOL_FINGER for devices that * only report general hover (and not per-contact * distance) when contact is in proximity but not * on the surface. */ count = 1; } input_mt_report_finger_count(dev, count); } if (oldest) { int x = input_mt_get_value(oldest, ABS_MT_POSITION_X); int y = input_mt_get_value(oldest, ABS_MT_POSITION_Y); input_event(dev, EV_ABS, ABS_X, x); input_event(dev, EV_ABS, ABS_Y, y); if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { int p = input_mt_get_value(oldest, ABS_MT_PRESSURE); input_event(dev, EV_ABS, ABS_PRESSURE, p); } } else { if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, 0); } } EXPORT_SYMBOL(input_mt_report_pointer_emulation); static void __input_mt_drop_unused(struct input_dev *dev, struct input_mt *mt) { int i; lockdep_assert_held(&dev->event_lock); for (i = 0; i < mt->num_slots; i++) { if (input_mt_is_active(&mt->slots[i]) && !input_mt_is_used(mt, &mt->slots[i])) { input_handle_event(dev, EV_ABS, ABS_MT_SLOT, i); input_handle_event(dev, EV_ABS, ABS_MT_TRACKING_ID, -1); } } } /** * input_mt_drop_unused() - Inactivate slots not seen in this frame * @dev: input device with allocated MT slots * * Lift all slots not seen since the last call to this function. */ void input_mt_drop_unused(struct input_dev *dev) { struct input_mt *mt = dev->mt; if (mt) { unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); __input_mt_drop_unused(dev, mt); mt->frame++; spin_unlock_irqrestore(&dev->event_lock, flags); } } EXPORT_SYMBOL(input_mt_drop_unused); /** * input_mt_release_slots() - Deactivate all slots * @dev: input device with allocated MT slots * * Lift all active slots. */ void input_mt_release_slots(struct input_dev *dev) { struct input_mt *mt = dev->mt; lockdep_assert_held(&dev->event_lock); if (mt) { /* This will effectively mark all slots unused. */ mt->frame++; __input_mt_drop_unused(dev, mt); if (test_bit(ABS_PRESSURE, dev->absbit)) input_handle_event(dev, EV_ABS, ABS_PRESSURE, 0); mt->frame++; } } /** * input_mt_sync_frame() - synchronize mt frame * @dev: input device with allocated MT slots * * Close the frame and prepare the internal state for a new one. * Depending on the flags, marks unused slots as inactive and performs * pointer emulation. */ void input_mt_sync_frame(struct input_dev *dev) { struct input_mt *mt = dev->mt; bool use_count = false; if (!mt) return; if (mt->flags & INPUT_MT_DROP_UNUSED) { unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); __input_mt_drop_unused(dev, mt); spin_unlock_irqrestore(&dev->event_lock, flags); } if ((mt->flags & INPUT_MT_POINTER) && !(mt->flags & INPUT_MT_SEMI_MT)) use_count = true; input_mt_report_pointer_emulation(dev, use_count); mt->frame++; } EXPORT_SYMBOL(input_mt_sync_frame); static int adjust_dual(int *begin, int step, int *end, int eq, int mu) { int f, *p, s, c; if (begin == end) return 0; f = *begin; p = begin + step; s = p == end ? f + 1 : *p; for (; p != end; p += step) { if (*p < f) { s = f; f = *p; } else if (*p < s) { s = *p; } } c = (f + s + 1) / 2; if (c == 0 || (c > mu && (!eq || mu > 0))) return 0; /* Improve convergence for positive matrices by penalizing overcovers */ if (s < 0 && mu <= 0) c *= 2; for (p = begin; p != end; p += step) *p -= c; return (c < s && s <= 0) || (f >= 0 && f < c); } static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu) { int i, k, sum; for (k = 0; k < nrc; k++) { for (i = 0; i < nr; i++) adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu); sum = 0; for (i = 0; i < nrc; i += nr) sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu); if (!sum) break; } } static int input_mt_set_matrix(struct input_mt *mt, const struct input_mt_pos *pos, int num_pos, int mu) { const struct input_mt_pos *p; struct input_mt_slot *s; int *w = mt->red; int x, y; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; x = input_mt_get_value(s, ABS_MT_POSITION_X); y = input_mt_get_value(s, ABS_MT_POSITION_Y); for (p = pos; p != pos + num_pos; p++) { int dx = x - p->x, dy = y - p->y; *w++ = dx * dx + dy * dy - mu; } } return w - mt->red; } static void input_mt_set_slots(struct input_mt *mt, int *slots, int num_pos) { struct input_mt_slot *s; int *w = mt->red, j; for (j = 0; j != num_pos; j++) slots[j] = -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (!input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (w[j] < 0) { slots[j] = s - mt->slots; break; } } w += num_pos; } for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { if (input_mt_is_active(s)) continue; for (j = 0; j != num_pos; j++) { if (slots[j] < 0) { slots[j] = s - mt->slots; break; } } } } /** * input_mt_assign_slots() - perform a best-match assignment * @dev: input device with allocated MT slots * @slots: the slot assignment to be filled * @pos: the position array to match * @num_pos: number of positions * @dmax: maximum ABS_MT_POSITION displacement (zero for infinite) * * Performs a best match against the current contacts and returns * the slot assignment list. New contacts are assigned to unused * slots. * * The assignments are balanced so that all coordinate displacements are * below the euclidian distance dmax. If no such assignment can be found, * some contacts are assigned to unused slots. * * Returns zero on success, or negative error in case of failure. */ int input_mt_assign_slots(struct input_dev *dev, int *slots, const struct input_mt_pos *pos, int num_pos, int dmax) { struct input_mt *mt = dev->mt; int mu = 2 * dmax * dmax; int nrc; if (!mt || !mt->red) return -ENXIO; if (num_pos > mt->num_slots) return -EINVAL; if (num_pos < 1) return 0; nrc = input_mt_set_matrix(mt, pos, num_pos, mu); find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu); input_mt_set_slots(mt, slots, num_pos); return 0; } EXPORT_SYMBOL(input_mt_assign_slots); /** * input_mt_get_slot_by_key() - return slot matching key * @dev: input device with allocated MT slots * @key: the key of the sought slot * * Returns the slot of the given key, if it exists, otherwise * set the key on the first unused slot and return. * * If no available slot can be found, -1 is returned. * Note that for this function to work properly, input_mt_sync_frame() has * to be called at each frame. */ int input_mt_get_slot_by_key(struct input_dev *dev, int key) { struct input_mt *mt = dev->mt; struct input_mt_slot *s; if (!mt) return -1; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (input_mt_is_active(s) && s->key == key) return s - mt->slots; for (s = mt->slots; s != mt->slots + mt->num_slots; s++) if (!input_mt_is_active(s) && !input_mt_is_used(mt, s)) { s->key = key; return s - mt->slots; } return -1; } EXPORT_SYMBOL(input_mt_get_slot_by_key);
1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 // SPDX-License-Identifier: GPL-2.0 /* * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2022 Intel Corporation * Copyright 2023-2024 NXP */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/iso.h> #include "eir.h" static const struct proto_ops iso_sock_ops; static struct bt_sock_list iso_sk_list = { .lock = __RW_LOCK_UNLOCKED(iso_sk_list.lock) }; /* ---- ISO connections ---- */ struct iso_conn { struct hci_conn *hcon; /* @lock: spinlock protecting changes to iso_conn fields */ spinlock_t lock; struct sock *sk; struct delayed_work timeout_work; struct sk_buff *rx_skb; __u32 rx_len; __u16 tx_sn; struct kref ref; }; #define iso_conn_lock(c) spin_lock(&(c)->lock) #define iso_conn_unlock(c) spin_unlock(&(c)->lock) static void iso_sock_close(struct sock *sk); static void iso_sock_kill(struct sock *sk); /* ----- ISO socket info ----- */ #define iso_pi(sk) ((struct iso_pinfo *)sk) #define EIR_SERVICE_DATA_LENGTH 4 #define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH) #define EIR_BAA_SERVICE_UUID 0x1851 /* iso_pinfo flags values */ enum { BT_SK_BIG_SYNC, BT_SK_PA_SYNC, }; struct iso_pinfo { struct bt_sock bt; bdaddr_t src; __u8 src_type; bdaddr_t dst; __u8 dst_type; __u8 bc_sid; __u8 bc_num_bis; __u8 bc_bis[ISO_MAX_NUM_BIS]; __u16 sync_handle; unsigned long flags; struct bt_iso_qos qos; bool qos_user_set; __u8 base_len; __u8 base[BASE_MAX_LENGTH]; struct iso_conn *conn; }; static struct bt_iso_qos default_qos; static bool check_ucast_qos(struct bt_iso_qos *qos); static bool check_bcast_qos(struct bt_iso_qos *qos); static bool iso_match_sid(struct sock *sk, void *data); static bool iso_match_sync_handle(struct sock *sk, void *data); static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data); static void iso_sock_disconn(struct sock *sk); typedef bool (*iso_sock_match_t)(struct sock *sk, void *data); static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ #define ISO_CONN_TIMEOUT (HZ * 40) #define ISO_DISCONN_TIMEOUT (HZ * 2) static void iso_conn_free(struct kref *ref) { struct iso_conn *conn = container_of(ref, struct iso_conn, ref); BT_DBG("conn %p", conn); if (conn->sk) iso_pi(conn->sk)->conn = NULL; if (conn->hcon) { conn->hcon->iso_data = NULL; hci_conn_drop(conn->hcon); } /* Ensure no more work items will run since hci_conn has been dropped */ disable_delayed_work_sync(&conn->timeout_work); kfree(conn); } static void iso_conn_put(struct iso_conn *conn) { if (!conn) return; BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref)); kref_put(&conn->ref, iso_conn_free); } static struct iso_conn *iso_conn_hold_unless_zero(struct iso_conn *conn) { if (!conn) return NULL; BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); if (!kref_get_unless_zero(&conn->ref)) return NULL; return conn; } static struct sock *iso_sock_hold(struct iso_conn *conn) { if (!conn || !bt_sock_linked(&iso_sk_list, conn->sk)) return NULL; sock_hold(conn->sk); return conn->sk; } static void iso_sock_timeout(struct work_struct *work) { struct iso_conn *conn = container_of(work, struct iso_conn, timeout_work.work); struct sock *sk; conn = iso_conn_hold_unless_zero(conn); if (!conn) return; iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); iso_conn_put(conn); if (!sk) return; BT_DBG("sock %p state %d", sk, sk->sk_state); lock_sock(sk); sk->sk_err = ETIMEDOUT; sk->sk_state_change(sk); release_sock(sk); sock_put(sk); } static void iso_sock_set_timer(struct sock *sk, long timeout) { if (!iso_pi(sk)->conn) return; BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); schedule_delayed_work(&iso_pi(sk)->conn->timeout_work, timeout); } static void iso_sock_clear_timer(struct sock *sk) { if (!iso_pi(sk)->conn) return; BT_DBG("sock %p state %d", sk, sk->sk_state); cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); } /* ---- ISO connections ---- */ static struct iso_conn *iso_conn_add(struct hci_conn *hcon) { struct iso_conn *conn = hcon->iso_data; conn = iso_conn_hold_unless_zero(conn); if (conn) { if (!conn->hcon) { iso_conn_lock(conn); conn->hcon = hcon; iso_conn_unlock(conn); } iso_conn_put(conn); return conn; } conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) return NULL; kref_init(&conn->ref); spin_lock_init(&conn->lock); INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout); hcon->iso_data = conn; conn->hcon = hcon; conn->tx_sn = 0; BT_DBG("hcon %p conn %p", hcon, conn); return conn; } /* Delete channel. Must be called on the locked socket. */ static void iso_chan_del(struct sock *sk, int err) { struct iso_conn *conn; struct sock *parent; conn = iso_pi(sk)->conn; iso_pi(sk)->conn = NULL; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { iso_conn_lock(conn); conn->sk = NULL; iso_conn_unlock(conn); iso_conn_put(conn); } sk->sk_state = BT_CLOSED; sk->sk_err = err; parent = bt_sk(sk)->parent; if (parent) { bt_accept_unlink(sk); parent->sk_data_ready(parent); } else { sk->sk_state_change(sk); } sock_set_flag(sk, SOCK_ZAPPED); } static void iso_conn_del(struct hci_conn *hcon, int err) { struct iso_conn *conn = hcon->iso_data; struct sock *sk; conn = iso_conn_hold_unless_zero(conn); if (!conn) return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); /* Kill socket */ iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); iso_conn_put(conn); if (!sk) { iso_conn_put(conn); return; } lock_sock(sk); iso_sock_clear_timer(sk); iso_chan_del(sk, err); release_sock(sk); sock_put(sk); } static int __iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { BT_DBG("conn %p", conn); if (iso_pi(sk)->conn == conn && conn->sk == sk) return 0; if (conn->sk) { BT_ERR("conn->sk already set"); return -EBUSY; } iso_pi(sk)->conn = conn; conn->sk = sk; if (parent) bt_accept_enqueue(parent, sk, true); return 0; } static int iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { int err; iso_conn_lock(conn); err = __iso_chan_add(conn, sk, parent); iso_conn_unlock(conn); return err; } static inline u8 le_addr_type(u8 bdaddr_type) { if (bdaddr_type == BDADDR_LE_PUBLIC) return ADDR_LE_DEV_PUBLIC; else return ADDR_LE_DEV_RANDOM; } static int iso_connect_bis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR", &iso_pi(sk)->src); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!bis_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if out PHYs are marked as disabled */ if (!iso_pi(sk)->qos.bcast.out.phy) { err = -EINVAL; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int iso_connect_cis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if either PHYs are marked as disabled */ if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) { err = -EINVAL; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk) { if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2) return &iso_pi(sk)->conn->hcon->iso_qos; return &iso_pi(sk)->qos; } static int iso_send_frame(struct sock *sk, struct sk_buff *skb) { struct iso_conn *conn = iso_pi(sk)->conn; struct bt_iso_qos *qos = iso_sock_get_qos(sk); struct hci_iso_data_hdr *hdr; int len = 0; BT_DBG("sk %p len %d", sk, skb->len); if (skb->len > qos->ucast.out.sdu) return -EMSGSIZE; len = skb->len; /* Push ISO data header */ hdr = skb_push(skb, HCI_ISO_DATA_HDR_SIZE); hdr->sn = cpu_to_le16(conn->tx_sn++); hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len, HCI_ISO_STATUS_VALID)); if (sk->sk_state == BT_CONNECTED) hci_send_iso(conn->hcon, skb); else len = -ENOTCONN; return len; } static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) { struct sock *sk; iso_conn_lock(conn); sk = conn->sk; iso_conn_unlock(conn); if (!sk) goto drop; BT_DBG("sk %p len %d", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; if (!sock_queue_rcv_skb(sk, skb)) return; drop: kfree_skb(skb); } /* -------- Socket interface ---------- */ static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->dst, dst)) continue; if (!bacmp(&iso_pi(sk)->src, src)) return sk; } return NULL; } static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc, __u8 sid) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->src, ba)) continue; if (bacmp(&iso_pi(sk)->dst, bc)) continue; if (iso_pi(sk)->bc_sid == sid) return sk; } return NULL; } /* Find socket in given state: * source bdaddr (Unicast) * destination bdaddr (Broadcast only) * match func - pass NULL to ignore * match func data - pass -1 to ignore * Returns closest match. */ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != state) continue; /* Match Broadcast destination */ if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) continue; /* Use Match function if provided */ if (match && !match(sk, data)) continue; /* Exact match. */ if (!bacmp(&iso_pi(sk)->src, src)) { sock_hold(sk); break; } /* Closest match */ if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) { if (sk1) sock_put(sk1); sk1 = sk; sock_hold(sk1); } } if (sk && sk1) sock_put(sk1); read_unlock(&iso_sk_list.lock); return sk ? sk : sk1; } static struct sock *iso_get_sock_big(struct sock *match_sk, bdaddr_t *src, bdaddr_t *dst, uint8_t big) { struct sock *sk = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (match_sk == sk) continue; /* Look for sockets that have already been * connected to the BIG */ if (sk->sk_state != BT_CONNECTED && sk->sk_state != BT_CONNECT) continue; /* Match Broadcast destination */ if (bacmp(&iso_pi(sk)->dst, dst)) continue; /* Match BIG handle */ if (iso_pi(sk)->qos.bcast.big != big) continue; /* Match source address */ if (bacmp(&iso_pi(sk)->src, src)) continue; sock_hold(sk); break; } read_unlock(&iso_sk_list.lock); return sk; } static void iso_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); iso_conn_put(iso_pi(sk)->conn); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void iso_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p", parent); /* Close not yet accepted channels */ while ((sk = bt_accept_dequeue(parent, NULL))) { iso_sock_close(sk); iso_sock_kill(sk); } /* If listening socket has a hcon, properly disconnect it */ if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) { iso_sock_disconn(parent); return; } parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket. */ static void iso_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || sock_flag(sk, SOCK_DEAD)) return; BT_DBG("sk %p state %d", sk, sk->sk_state); /* Kill poor orphan */ bt_sock_unlink(&iso_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static void iso_sock_disconn(struct sock *sk) { struct sock *bis_sk; struct hci_conn *hcon = iso_pi(sk)->conn->hcon; if (test_bit(HCI_CONN_BIG_CREATED, &hcon->flags)) { bis_sk = iso_get_sock_big(sk, &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->qos.bcast.big); /* If there are any other connected sockets for the * same BIG, just delete the sk and leave the bis * hcon active, in case later rebinding is needed. */ if (bis_sk) { hcon->state = BT_OPEN; hcon->iso_data = NULL; iso_pi(sk)->conn->hcon = NULL; iso_sock_clear_timer(sk); iso_chan_del(sk, bt_to_errno(hcon->abort_reason)); sock_put(bis_sk); return; } } sk->sk_state = BT_DISCONN; iso_conn_lock(iso_pi(sk)->conn); hci_conn_drop(iso_pi(sk)->conn->hcon); iso_pi(sk)->conn->hcon = NULL; iso_conn_unlock(iso_pi(sk)->conn); } static void __iso_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); switch (sk->sk_state) { case BT_LISTEN: iso_sock_cleanup_listen(sk); break; case BT_CONNECT: case BT_CONNECTED: case BT_CONFIG: if (iso_pi(sk)->conn->hcon) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_CONNECT2: if (iso_pi(sk)->conn->hcon && (test_bit(HCI_CONN_PA_SYNC, &iso_pi(sk)->conn->hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &iso_pi(sk)->conn->hcon->flags))) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_DISCONN: iso_chan_del(sk, ECONNRESET); break; default: sock_set_flag(sk, SOCK_ZAPPED); break; } } /* Must be called on unlocked socket. */ static void iso_sock_close(struct sock *sk) { iso_sock_clear_timer(sk); lock_sock(sk); __iso_sock_close(sk); release_sock(sk); iso_sock_kill(sk); } static void iso_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); } } static struct proto iso_proto = { .name = "ISO", .owner = THIS_MODULE, .obj_size = sizeof(struct iso_pinfo) }; #define DEFAULT_IO_QOS \ { \ .interval = 10000u, \ .latency = 10u, \ .sdu = 40u, \ .phy = BT_ISO_PHY_2M, \ .rtn = 2u, \ } static struct bt_iso_qos default_qos = { .bcast = { .big = BT_ISO_QOS_BIG_UNSET, .bis = BT_ISO_QOS_BIS_UNSET, .sync_factor = 0x01, .packing = 0x00, .framing = 0x00, .in = DEFAULT_IO_QOS, .out = DEFAULT_IO_QOS, .encryption = 0x00, .bcode = {0x00}, .options = 0x00, .skip = 0x0000, .sync_timeout = BT_ISO_SYNC_TIMEOUT, .sync_cte_type = 0x00, .mse = 0x00, .timeout = BT_ISO_SYNC_TIMEOUT, }, }; static struct sock *iso_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = iso_sock_destruct; sk->sk_sndtimeo = ISO_CONN_TIMEOUT; /* Set address type as public as default src address is BDADDR_ANY */ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; iso_pi(sk)->qos = default_qos; iso_pi(sk)->sync_handle = -1; bt_sock_link(&iso_sk_list, sk); return sk; } static int iso_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; sock->ops = &iso_sock_ops; sk = iso_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; iso_sock_init(sk, NULL); return 0; } static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int i; BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid, sa->iso_bc->bc_num_bis); if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) return -EINVAL; bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr); /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) return -EINVAL; iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; if (sa->iso_bc->bc_sid > 0x0f) return -EINVAL; iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid; if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) return -EINVAL; iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) return -EINVAL; memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); return 0; } static int iso_sock_bind_pa_sk(struct sock *sk, struct sockaddr_iso *sa, int addr_len) { int err = 0; if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) { err = -EINVAL; goto done; } if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) { err = -EINVAL; goto done; } iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) { err = -EINVAL; goto done; } memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); done: return err; } static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bdaddr, sa->iso_bdaddr_type); if (!addr || addr_len < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); /* Allow the user to bind a PA sync socket to a number * of BISes to sync to. */ if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) && test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { err = iso_sock_bind_pa_sk(sk, sa, addr_len); goto done; } if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) { err = -EINVAL; goto done; } bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr); iso_pi(sk)->src_type = sa->iso_bdaddr_type; /* Check for Broadcast address */ if (addr_len > sizeof(*sa)) { err = iso_sock_bind_bc(sock, addr, addr_len); if (err) goto done; } sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) return -EBADFD; if (sk->sk_type != SOCK_SEQPACKET) return -EINVAL; /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) return -EINVAL; lock_sock(sk); bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; release_sock(sk); if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); if (err) return err; lock_sock(sk); if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); } release_sock(sk); return err; } static int iso_listen_bis(struct sock *sk) { struct hci_dev *hdev; int err = 0; struct iso_conn *conn; struct hci_conn *hcon; BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); if (err) return err; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } hcon = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } err = iso_chan_add(conn, sk, NULL); if (err) { hci_conn_drop(hcon); goto unlock; } hci_dev_put(hdev); unlock: hci_dev_unlock(hdev); return err; } static int iso_listen_cis(struct sock *sk) { int err = 0; BT_DBG("%pMR", &iso_pi(sk)->src); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src, &iso_pi(sk)->dst)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); return err; } static int iso_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } if (!bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_listen_cis(sk); else err = iso_listen_bis(sk); if (err) goto done; sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; done: release_sock(sk); return err; } static int iso_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; long timeo; int err = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } ch = bt_accept_dequeue(sk, newsock); if (ch) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", ch); done: release_sock(sk); return err; } static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; } else { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src); sa->iso_bdaddr_type = iso_pi(sk)->src_type; } return sizeof(struct sockaddr_iso); } static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb, **frag; size_t mtu; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); if (sk->sk_state != BT_CONNECTED) { release_sock(sk); return -ENOTCONN; } mtu = iso_pi(sk)->conn->hcon->mtu; release_sock(sk); skb = bt_skb_sendmsg(sk, msg, len, mtu, HCI_ISO_DATA_HDR_SIZE, 0); if (IS_ERR(skb)) return PTR_ERR(skb); len -= skb->len; BT_DBG("skb %p len %d", sk, skb->len); /* Continuation fragments */ frag = &skb_shinfo(skb)->frag_list; while (len) { struct sk_buff *tmp; tmp = bt_skb_sendmsg(sk, msg, len, mtu, 0, 0); if (IS_ERR(tmp)) { kfree_skb(skb); return PTR_ERR(tmp); } *frag = tmp; len -= tmp->len; skb->len += tmp->len; skb->data_len += tmp->len; BT_DBG("frag %p len %d", *frag, tmp->len); frag = &(*frag)->next; } lock_sock(sk); if (sk->sk_state == BT_CONNECTED) err = iso_send_frame(sk, skb); else err = -ENOTCONN; release_sock(sk); if (err < 0) kfree_skb(skb); return err; } static void iso_conn_defer_accept(struct hci_conn *conn) { struct hci_cp_le_accept_cis cp; struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p", conn); conn->state = BT_CONFIG; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp); } static void iso_conn_big_sync(struct sock *sk) { int err; struct hci_dev *hdev; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return; /* hci_le_big_create_sync requires hdev lock to be held, since * it enqueues the HCI LE BIG Create Sync command via * hci_cmd_sync_queue_once, which checks hdev flags that might * change. */ hci_dev_lock(hdev); if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_le_big_create_sync(hdev, iso_pi(sk)->conn->hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); } hci_dev_unlock(hdev); } static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct iso_pinfo *pi = iso_pi(sk); BT_DBG("sk %p", sk); if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: if (test_bit(BT_SK_PA_SYNC, &pi->flags)) { iso_conn_big_sync(sk); sk->sk_state = BT_LISTEN; } else { iso_conn_defer_accept(pi->conn->hcon); sk->sk_state = BT_CONFIG; } release_sock(sk); return 0; case BT_CONNECTED: if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { iso_conn_big_sync(sk); sk->sk_state = BT_LISTEN; release_sock(sk); return 0; } release_sock(sk); break; case BT_CONNECT: release_sock(sk); return iso_connect_cis(sk); default: release_sock(sk); break; } } return bt_sock_recvmsg(sock, msg, len, flags); } static bool check_io_qos(struct bt_iso_io_qos *qos) { /* If no PHY is enable SDU must be 0 */ if (!qos->phy && qos->sdu) return false; if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff)) return false; if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0)) return false; if (qos->phy > BT_ISO_PHY_ANY) return false; return true; } static bool check_ucast_qos(struct bt_iso_qos *qos) { if (qos->ucast.cig > 0xef && qos->ucast.cig != BT_ISO_QOS_CIG_UNSET) return false; if (qos->ucast.cis > 0xef && qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) return false; if (qos->ucast.sca > 0x07) return false; if (qos->ucast.packing > 0x01) return false; if (qos->ucast.framing > 0x01) return false; if (!check_io_qos(&qos->ucast.in)) return false; if (!check_io_qos(&qos->ucast.out)) return false; return true; } static bool check_bcast_qos(struct bt_iso_qos *qos) { if (!qos->bcast.sync_factor) qos->bcast.sync_factor = 0x01; if (qos->bcast.packing > 0x01) return false; if (qos->bcast.framing > 0x01) return false; if (!check_io_qos(&qos->bcast.in)) return false; if (!check_io_qos(&qos->bcast.out)) return false; if (qos->bcast.encryption > 0x01) return false; if (qos->bcast.options > 0x07) return false; if (qos->bcast.skip > 0x01f3) return false; if (!qos->bcast.sync_timeout) qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000) return false; if (qos->bcast.sync_cte_type > 0x1f) return false; if (qos->bcast.mse > 0x1f) return false; if (!qos->bcast.timeout) qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT; if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000) return false; return true; } static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; struct bt_iso_qos qos = default_qos; u32 opt; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; case BT_PKT_STATUS: err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2 && (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags) || sk->sk_state != BT_CONNECTED)) { err = -EINVAL; break; } err = bt_copy_from_sockptr(&qos, sizeof(qos), optval, optlen); if (err) break; iso_pi(sk)->qos = qos; iso_pi(sk)->qos_user_set = true; break; case BT_ISO_BASE: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } if (optlen > sizeof(iso_pi(sk)->base)) { err = -EINVAL; break; } err = bt_copy_from_sockptr(iso_pi(sk)->base, optlen, optval, optlen); if (err) break; iso_pi(sk)->base_len = optlen; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_iso_qos *qos; u8 base_len; u8 *base; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *)optval)) err = -EFAULT; break; case BT_PKT_STATUS: if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), (int __user *)optval)) err = -EFAULT; break; case BT_ISO_QOS: qos = iso_sock_get_qos(sk); len = min_t(unsigned int, len, sizeof(*qos)); if (copy_to_user(optval, qos, len)) err = -EFAULT; break; case BT_ISO_BASE: if (sk->sk_state == BT_CONNECTED && !bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) { base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len; base = iso_pi(sk)->conn->hcon->le_per_adv_data; } else { base_len = iso_pi(sk)->base_len; base = iso_pi(sk)->base; } len = min_t(unsigned int, len, base_len); if (copy_to_user(optval, base, len)) err = -EFAULT; if (put_user(len, optlen)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p, how %d", sock, sk, how); if (!sk) return 0; sock_hold(sk); lock_sock(sk); switch (how) { case SHUT_RD: if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; sk->sk_shutdown |= RCV_SHUTDOWN; break; case SHUT_WR: if (sk->sk_shutdown & SEND_SHUTDOWN) goto unlock; sk->sk_shutdown |= SEND_SHUTDOWN; break; case SHUT_RDWR: if (sk->sk_shutdown & SHUTDOWN_MASK) goto unlock; sk->sk_shutdown |= SHUTDOWN_MASK; break; } iso_sock_clear_timer(sk); __iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); unlock: release_sock(sk); sock_put(sk); return err; } static int iso_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); } sock_orphan(sk); iso_sock_kill(sk); return err; } static void iso_sock_ready(struct sock *sk) { BT_DBG("sk %p", sk); if (!sk) return; lock_sock(sk); iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); release_sock(sk); } static bool iso_match_big(struct sock *sk, void *data) { struct hci_evt_le_big_sync_estabilished *ev = data; return ev->handle == iso_pi(sk)->qos.bcast.big; } static bool iso_match_big_hcon(struct sock *sk, void *data) { struct hci_conn *hcon = data; return hcon->iso_qos.bcast.big == iso_pi(sk)->qos.bcast.big; } static bool iso_match_pa_sync_flag(struct sock *sk, void *data) { return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; struct sock *sk = conn->sk; struct hci_ev_le_big_sync_estabilished *ev = NULL; struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; BT_DBG("conn %p", conn); if (sk) { iso_sock_ready(conn->sk); } else { hcon = conn->hcon; if (!hcon) return; if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) { /* A BIS slave hcon is notified to the ISO layer * after the Command Complete for the LE Setup * ISO Data Path command is received. Get the * parent socket that matches the hcon BIG handle. */ parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_big_hcon, hcon); } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { ev = hci_recv_event_data(hcon->hdev, HCI_EVT_LE_BIG_SYNC_ESTABILISHED); /* Get reference to PA sync parent socket, if it exists */ parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_pa_sync_flag, NULL); if (!parent && ev) parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_big, ev); } else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { ev2 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev2) parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_sid, ev2); } else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { ev3 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) parent = iso_get_sock(&hcon->src, &hcon->dst, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); } if (!parent) parent = iso_get_sock(&hcon->src, BDADDR_ANY, BT_LISTEN, NULL, NULL); if (!parent) return; lock_sock(parent); sk = iso_sock_alloc(sock_net(parent), NULL, BTPROTO_ISO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return; } iso_sock_init(sk, parent); bacpy(&iso_pi(sk)->src, &hcon->src); /* Convert from HCI to three-value type */ if (hcon->src_type == ADDR_LE_DEV_PUBLIC) iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; else iso_pi(sk)->src_type = BDADDR_LE_RANDOM; /* If hcon has no destination address (BDADDR_ANY) it means it * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED or * HCI_EV_LE_PA_SYNC_ESTABLISHED so we need to initialize using * the parent socket destination address. */ if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); hcon->dst_type = iso_pi(parent)->dst_type; } if (ev3) { iso_pi(sk)->qos = iso_pi(parent)->qos; hcon->iso_qos = iso_pi(sk)->qos; iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } bacpy(&iso_pi(sk)->dst, &hcon->dst); iso_pi(sk)->dst_type = hcon->dst_type; iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len); iso_pi(sk)->base_len = iso_pi(parent)->base_len; hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); if ((ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) || (ev2 && ev2->status)) { /* Trigger error signal on child socket */ sk->sk_err = ECONNREFUSED; sk->sk_error_report(sk); } if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else sk->sk_state = BT_CONNECTED; /* Wake up parent */ parent->sk_data_ready(parent); release_sock(parent); sock_put(parent); } } static bool iso_match_sid(struct sock *sk, void *data) { struct hci_ev_le_pa_sync_established *ev = data; return ev->sid == iso_pi(sk)->bc_sid; } static bool iso_match_sync_handle(struct sock *sk, void *data) { struct hci_evt_le_big_info_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data) { struct hci_ev_le_per_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } /* ----- ISO interface with lower layer (HCI) ----- */ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct hci_ev_le_pa_sync_established *ev1; struct hci_evt_le_big_info_adv_report *ev2; struct hci_ev_le_per_adv_report *ev3; struct sock *sk; bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); /* Broadcast receiver requires handling of some events before it can * proceed to establishing a BIG sync: * * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific * SID to listen to and once sync is estabilished its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a * a BIG Info it attempts to check if there any listening socket with * the same sync_handle and if it does then attempt to create a sync. * 3. HCI_EV_LE_PER_ADV_REPORT: When a PA report is received, it is stored * in iso_pi(sk)->base so it can be passed up to user, in the case of a * broadcast sink. */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); goto done; } ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { /* Check if BIGInfo report has already been handled */ sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECTED, iso_match_sync_handle, ev2); if (sk) { sock_put(sk); sk = NULL; goto done; } /* Try to get PA sync socket, if it exists */ sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECT2, iso_match_sync_handle, ev2); if (!sk) sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle, ev2); if (sk) { int err; struct hci_conn *hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_le_big_create_sync(hdev, hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) { bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); sock_put(sk); sk = NULL; } } } goto done; } ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) { size_t base_len = 0; u8 *base; struct hci_conn *hcon; sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); if (!sk) goto done; hcon = iso_pi(sk)->conn->hcon; if (!hcon) goto done; if (ev3->data_status == LE_PA_DATA_TRUNCATED) { /* The controller was unable to retrieve PA data. */ memset(hcon->le_per_adv_data, 0, HCI_MAX_PER_AD_TOT_LEN); hcon->le_per_adv_data_len = 0; hcon->le_per_adv_data_offset = 0; goto done; } if (hcon->le_per_adv_data_offset + ev3->length > HCI_MAX_PER_AD_TOT_LEN) goto done; memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset, ev3->data, ev3->length); hcon->le_per_adv_data_offset += ev3->length; if (ev3->data_status == LE_PA_DATA_COMPLETE) { /* All PA data has been received. */ hcon->le_per_adv_data_len = hcon->le_per_adv_data_offset; hcon->le_per_adv_data_offset = 0; /* Extract BASE */ base = eir_get_service_data(hcon->le_per_adv_data, hcon->le_per_adv_data_len, EIR_BAA_SERVICE_UUID, &base_len); if (!base || base_len > BASE_MAX_LENGTH) goto done; memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; } else { /* This is a PA data fragment. Keep pa_data_len set to 0 * until all data has been reassembled. */ hcon->le_per_adv_data_len = 0; } } else { sk = iso_get_sock(&hdev->bdaddr, BDADDR_ANY, BT_LISTEN, NULL, NULL); } done: if (!sk) return 0; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; sock_put(sk); return HCI_LM_ACCEPT; } static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != ISO_LINK) { if (hcon->type != LE_LINK) return; /* Check if LE link has failed */ if (status) { struct hci_link *link, *t; list_for_each_entry_safe(link, t, &hcon->link_list, list) iso_conn_del(link->conn, bt_to_errno(status)); return; } /* Create CIS if pending */ hci_le_create_cis_pending(hcon->hdev); return; } BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED or * HCI_CONN_PA_SYNC_FAILED is set, queue the failed connection * into the accept queue of the listening socket and wake up * userspace, to inform the user about the event. */ if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { struct iso_conn *conn; conn = iso_conn_add(hcon); if (conn) iso_conn_ready(conn); } else { iso_conn_del(hcon, bt_to_errno(status)); } } static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { if (hcon->type != ISO_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); iso_conn_del(hcon, bt_to_errno(reason)); } void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct iso_conn *conn = hcon->iso_data; __u16 pb, ts, len; if (!conn) goto drop; pb = hci_iso_flags_pb(flags); ts = hci_iso_flags_ts(flags); BT_DBG("conn %p len %d pb 0x%x ts 0x%x", conn, skb->len, pb, ts); switch (pb) { case ISO_START: case ISO_SINGLE: if (conn->rx_len) { BT_ERR("Unexpected start frame (len %d)", skb->len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; } if (ts) { struct hci_iso_ts_data_hdr *hdr; /* TODO: add timestamp to the packet? */ hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } len = __le16_to_cpu(hdr->slen); } else { struct hci_iso_data_hdr *hdr; hdr = skb_pull_data(skb, HCI_ISO_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } len = __le16_to_cpu(hdr->slen); } flags = hci_iso_data_flags(len); len = hci_iso_data_len(len); BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x", len, skb->len, flags); if (len == skb->len) { /* Complete frame received */ hci_skb_pkt_status(skb) = flags & 0x03; iso_recv_frame(conn, skb); return; } if (pb == ISO_SINGLE) { BT_ERR("Frame malformed (len %d, expected len %d)", skb->len, len); goto drop; } if (skb->len > len) { BT_ERR("Frame is too long (len %d, expected len %d)", skb->len, len); goto drop; } /* Allocate skb for the complete frame (with header) */ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); if (!conn->rx_skb) goto drop; hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; break; case ISO_CONT: BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); if (!conn->rx_len) { BT_ERR("Unexpected continuation frame (len %d)", skb->len); goto drop; } if (skb->len > conn->rx_len) { BT_ERR("Fragment is too long (len %d, expected %d)", skb->len, conn->rx_len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; goto drop; } skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; return; case ISO_END: skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; if (!conn->rx_len) { struct sk_buff *rx_skb = conn->rx_skb; /* Complete frame received. iso_recv_frame * takes ownership of the skb so set the global * rx_skb pointer to NULL first. */ conn->rx_skb = NULL; iso_recv_frame(conn, rx_skb); } break; } drop: kfree_skb(skb); } static struct hci_cb iso_cb = { .name = "ISO", .connect_cfm = iso_connect_cfm, .disconn_cfm = iso_disconn_cfm, }; static int iso_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &iso_pi(sk)->src, &iso_pi(sk)->dst, sk->sk_state); } read_unlock(&iso_sk_list.lock); return 0; } DEFINE_SHOW_ATTRIBUTE(iso_debugfs); static struct dentry *iso_debugfs; static const struct proto_ops iso_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = iso_sock_release, .bind = iso_sock_bind, .connect = iso_sock_connect, .listen = iso_sock_listen, .accept = iso_sock_accept, .getname = iso_sock_getname, .sendmsg = iso_sock_sendmsg, .recvmsg = iso_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = iso_sock_shutdown, .setsockopt = iso_sock_setsockopt, .getsockopt = iso_sock_getsockopt }; static const struct net_proto_family iso_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = iso_sock_create, }; static bool iso_inited; bool iso_enabled(void) { return iso_inited; } int iso_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); if (iso_inited) return -EALREADY; err = proto_register(&iso_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_ISO, &iso_sock_family_ops); if (err < 0) { BT_ERR("ISO socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "iso", &iso_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create ISO proc file"); bt_sock_unregister(BTPROTO_ISO); goto error; } BT_INFO("ISO socket layer initialized"); hci_register_cb(&iso_cb); if (!IS_ERR_OR_NULL(bt_debugfs)) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); iso_inited = true; return 0; error: proto_unregister(&iso_proto); return err; } int iso_exit(void) { if (!iso_inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); debugfs_remove(iso_debugfs); iso_debugfs = NULL; hci_unregister_cb(&iso_cb); bt_sock_unregister(BTPROTO_ISO); proto_unregister(&iso_proto); iso_inited = false; return 0; }
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/auth.c * * Generic RPC client authentication API. * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #include <linux/types.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/hash.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/spinlock.h> #include <trace/events/sunrpc.h> #define RPC_CREDCACHE_DEFAULT_HASHBITS (4) struct rpc_cred_cache { struct hlist_head *hashtable; unsigned int hashbits; spinlock_t lock; }; static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, [RPC_AUTH_TLS] = (const struct rpc_authops __force __rcu *)&authtls_ops, }; static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; static struct cred machine_cred = { .usage = ATOMIC_INIT(1), }; /* * Return the machine_cred pointer to be used whenever * the a generic machine credential is needed. */ const struct cred *rpc_machine_cred(void) { return &machine_cred; } EXPORT_SYMBOL_GPL(rpc_machine_cred); #define MAX_HASHTABLE_BITS (14) static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) { unsigned long num; unsigned int nbits; int ret; if (!val) goto out_inval; ret = kstrtoul(val, 0, &num); if (ret) goto out_inval; nbits = fls(num - 1); if (nbits > MAX_HASHTABLE_BITS || nbits < 2) goto out_inval; *(unsigned int *)kp->arg = nbits; return 0; out_inval: return -EINVAL; } static int param_get_hashtbl_sz(char *buffer, const struct kernel_param *kp) { unsigned int nbits; nbits = *(unsigned int *)kp->arg; return sprintf(buffer, "%u\n", 1U << nbits); } #define param_check_hashtbl_sz(name, p) __param_check(name, p, unsigned int); static const struct kernel_param_ops param_ops_hashtbl_sz = { .set = param_set_hashtbl_sz, .get = param_get_hashtbl_sz, }; module_param_named(auth_hashtable_size, auth_hashbits, hashtbl_sz, 0644); MODULE_PARM_DESC(auth_hashtable_size, "RPC credential cache hashtable size"); static unsigned long auth_max_cred_cachesize = ULONG_MAX; module_param(auth_max_cred_cachesize, ulong, 0644); MODULE_PARM_DESC(auth_max_cred_cachesize, "RPC credential maximum total cache size"); static u32 pseudoflavor_to_flavor(u32 flavor) { if (flavor > RPC_AUTH_MAXFLAVOR) return RPC_AUTH_GSS; return flavor; } int rpcauth_register(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); if (old == NULL || old == ops) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { const struct rpc_authops *old; rpc_authflavor_t flavor; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); if (old == ops || old == NULL) return 0; return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); static const struct rpc_authops * rpcauth_get_authops(rpc_authflavor_t flavor) { const struct rpc_authops *ops; if (flavor >= RPC_AUTH_MAXFLAVOR) return NULL; rcu_read_lock(); ops = rcu_dereference(auth_flavors[flavor]); if (ops == NULL) { rcu_read_unlock(); request_module("rpc-auth-%u", flavor); rcu_read_lock(); ops = rcu_dereference(auth_flavors[flavor]); if (ops == NULL) goto out; } if (!try_module_get(ops->owner)) ops = NULL; out: rcu_read_unlock(); return ops; } static void rpcauth_put_authops(const struct rpc_authops *ops) { module_put(ops->owner); } /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor * @info: a GSS mech OID, quality of protection, and service value * * Verifies that an appropriate kernel module is available or already loaded. * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is * not supported locally. */ rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; if (!ops) return RPC_AUTH_MAXFLAVOR; pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); /** * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor * @pseudoflavor: GSS pseudoflavor to match * @info: rpcsec_gss_info structure to fill in * * Returns zero and fills in "info" if pseudoflavor matches a * supported mechanism. */ int rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) { rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor); const struct rpc_authops *ops; int result; ops = rpcauth_get_authops(flavor); if (ops == NULL) return -ENOENT; result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); ops = rpcauth_get_authops(flavor); if (ops == NULL) goto out; auth = ops->create(args, clnt); rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) rpcauth_release(clnt->cl_auth); clnt->cl_auth = auth; out: return auth; } EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); /* * On success, the caller is responsible for freeing the reference * held by the hashtable */ static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; hlist_del_rcu(&cred->cr_hash); return true; } static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; bool ret; if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } /* * Initialize RPC credential cache */ int rpcauth_init_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *new; unsigned int hashsize; new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) goto out_nocache; new->hashbits = auth_hashbits; hashsize = 1U << new->hashbits; new->hashtable = kcalloc(hashsize, sizeof(new->hashtable[0]), GFP_KERNEL); if (!new->hashtable) goto out_nohashtbl; spin_lock_init(&new->lock); auth->au_credcache = new; return 0; out_nohashtbl: kfree(new); out_nocache: return -ENOMEM; } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); char * rpcauth_stringify_acceptor(struct rpc_cred *cred) { if (!cred->cr_ops->crstringify_acceptor) return NULL; return cred->cr_ops->crstringify_acceptor(cred); } EXPORT_SYMBOL_GPL(rpcauth_stringify_acceptor); /* * Destroy a list of credentials */ static inline void rpcauth_destroy_credlist(struct list_head *head) { struct rpc_cred *cred; while (!list_empty(head)) { cred = list_entry(head->next, struct rpc_cred, cr_lru); list_del_init(&cred->cr_lru); put_rpccred(cred); } } static void rpcauth_lru_add_locked(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; number_cred_unused++; list_add_tail(&cred->cr_lru, &cred_unused); } static void rpcauth_lru_add(struct rpc_cred *cred) { if (!list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_add_locked(cred); spin_unlock(&rpc_credcache_lock); } static void rpcauth_lru_remove_locked(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; number_cred_unused--; list_del_init(&cred->cr_lru); } static void rpcauth_lru_remove(struct rpc_cred *cred) { if (list_empty(&cred->cr_lru)) return; spin_lock(&rpc_credcache_lock); rpcauth_lru_remove_locked(cred); spin_unlock(&rpc_credcache_lock); } /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. */ void rpcauth_clear_credcache(struct rpc_cred_cache *cache) { LIST_HEAD(free); struct hlist_head *head; struct rpc_cred *cred; unsigned int hashsize = 1U << cache->hashbits; int i; spin_lock(&rpc_credcache_lock); spin_lock(&cache->lock); for (i = 0; i < hashsize; i++) { head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); rpcauth_unhash_cred_locked(cred); /* Note: We now hold a reference to cred */ rpcauth_lru_remove_locked(cred); list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); } /* * Destroy the RPC credential cache */ void rpcauth_destroy_credcache(struct rpc_auth *auth) { struct rpc_cred_cache *cache = auth->au_credcache; if (cache) { auth->au_credcache = NULL; rpcauth_clear_credcache(cache); kfree(cache->hashtable); kfree(cache); } } EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); #define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ) /* * Remove stale credentials. Avoid sleeping inside the loop. */ static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) { if (nr_to_scan-- == 0) break; if (refcount_read(&cred->cr_count) > 1) { rpcauth_lru_remove_locked(cred); continue; } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ if (time_in_range(cred->cr_expire, expired, jiffies)) continue; if (!rpcauth_unhash_cred(cred)) continue; rpcauth_lru_remove_locked(cred); freed++; list_add_tail(&cred->cr_lru, free); } return freed ? freed : SHRINK_STOP; } static unsigned long rpcauth_cache_do_shrink(int nr_to_scan) { LIST_HEAD(free); unsigned long freed; spin_lock(&rpc_credcache_lock); freed = rpcauth_prune_expired(&free, nr_to_scan); spin_unlock(&rpc_credcache_lock); rpcauth_destroy_credlist(&free); return freed; } /* * Run memory cache shrinker. */ static unsigned long rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; /* nothing left, don't come back */ if (list_empty(&cred_unused)) return SHRINK_STOP; return rpcauth_cache_do_shrink(sc->nr_to_scan); } static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return number_cred_unused * sysctl_vfs_cache_pressure / 100; } static void rpcauth_cache_enforce_limit(void) { unsigned long diff; unsigned int nr_to_scan; if (number_cred_unused <= auth_max_cred_cachesize) return; diff = number_cred_unused - auth_max_cred_cachesize; nr_to_scan = 100; if (diff < nr_to_scan) nr_to_scan = diff; rpcauth_cache_do_shrink(nr_to_scan); } /* * Look up a process' credentials in the authentication cache */ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, int flags, gfp_t gfp) { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; struct rpc_cred *cred = NULL, *entry, *new; unsigned int nr; nr = auth->au_ops->hash_cred(acred, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } rcu_read_unlock(); if (cred != NULL) goto found; new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; goto out; } spin_lock(&cache->lock); hlist_for_each_entry(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); if (cred) break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); spin_unlock(&cache->lock); rpcauth_cache_enforce_limit(); found: if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && cred->cr_ops->cr_init != NULL && !(flags & RPCAUTH_LOOKUP_NEW)) { int res = cred->cr_ops->cr_init(auth, cred); if (res < 0) { put_rpccred(cred); cred = ERR_PTR(res); } } rpcauth_destroy_credlist(&free); out: return cred; } EXPORT_SYMBOL_GPL(rpcauth_lookup_credcache); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *auth, int flags) { struct auth_cred acred; struct rpc_cred *ret; const struct cred *cred = current_cred(); memset(&acred, 0, sizeof(acred)); acred.cred = cred; ret = auth->au_ops->lookup_cred(auth, &acred, flags); return ret; } EXPORT_SYMBOL_GPL(rpcauth_lookupcred); void rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, struct rpc_auth *auth, const struct rpc_credops *ops) { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_flags = 0; cred->cr_ops = ops; cred->cr_expire = jiffies; cred->cr_cred = get_cred(acred->cred); } EXPORT_SYMBOL_GPL(rpcauth_init_cred); static struct rpc_cred * rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = get_task_cred(&init_task), }; struct rpc_cred *ret; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); put_cred(acred.cred); return ret; } static struct rpc_cred * rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .principal = task->tk_client->cl_principal, .cred = init_task.cred, }; if (!acred.principal) return NULL; if (RPC_IS_ASYNC(task)) lookupflags |= RPCAUTH_LOOKUP_ASYNC; return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } static struct rpc_cred * rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; return rpcauth_lookupcred(auth, lookupflags); } static int rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_cred *new = NULL; int lookupflags = 0; struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { .cred = cred, }; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC; if (task->tk_op_cred) /* Task must use exactly this rpc_cred */ new = get_rpccred(task->tk_op_cred); else if (cred != NULL && cred != &machine_cred) new = auth->au_ops->lookup_cred(auth, &acred, lookupflags); else if (cred == &machine_cred) new = rpcauth_bind_machine_cred(task, lookupflags); /* If machine cred couldn't be bound, try a root cred */ if (new) ; else if (cred == &machine_cred) new = rpcauth_bind_root_cred(task, lookupflags); else if (flags & RPC_TASK_NULLCREDS) new = authnull_ops.lookup_cred(NULL, NULL, 0); else new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); put_rpccred(req->rq_cred); req->rq_cred = new; return 0; } void put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; rcu_read_lock(); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; if (refcount_read(&cred->cr_count) != 1 || !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) goto out; if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { cred->cr_expire = jiffies; rpcauth_lru_add(cred); /* Race breaker */ if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) rpcauth_lru_remove(cred); } else if (rpcauth_unhash_cred(cred)) { rpcauth_lru_remove(cred); if (refcount_dec_and_test(&cred->cr_count)) goto destroy; } out: rcu_read_unlock(); return; destroy: rcu_read_unlock(); cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); /** * rpcauth_marshcred - Append RPC credential to end of @xdr * @task: controlling RPC task * @xdr: xdr_stream containing initial portion of RPC Call header * * On success, an appropriate verifier is added to @xdr, @xdr is * updated to point past the verifier, and zero is returned. * Otherwise, @xdr is in an undefined state and a negative errno * is returned. */ int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crmarshal(task, xdr); } /** * rpcauth_wrap_req_encode - XDR encode the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message. * Otherwise, @xdr is in an undefined state. */ int rpcauth_wrap_req_encode(struct rpc_task *task, struct xdr_stream *xdr) { kxdreproc_t encode = task->tk_msg.rpc_proc->p_encode; encode(task->tk_rqstp, xdr, task->tk_msg.rpc_argp); return 0; } EXPORT_SYMBOL_GPL(rpcauth_wrap_req_encode); /** * rpcauth_wrap_req - XDR encode and wrap the RPC procedure * @task: controlling RPC task * @xdr: stream where on-the-wire bytes are to be marshalled * * On success, @xdr contains the encoded and wrapped message, * and zero is returned. Otherwise, @xdr is in an undefined * state and a negative errno is returned. */ int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crwrap_req(task, xdr); } /** * rpcauth_checkverf - Validate verifier in RPC Reply header * @task: controlling RPC task * @xdr: xdr_stream containing RPC Reply header * * Return values: * %0: Verifier is valid. @xdr now points past the verifier. * %-EIO: Verifier is corrupted or message ended early. * %-EACCES: Verifier is intact but not valid. * %-EPROTONOSUPPORT: Server does not support the requested auth type. * * When a negative errno is returned, @xdr is left in an undefined * state. */ int rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crvalidate(task, xdr); } /** * rpcauth_unwrap_resp_decode - Invoke XDR decode function * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp_decode(struct rpc_task *task, struct xdr_stream *xdr) { kxdrdproc_t decode = task->tk_msg.rpc_proc->p_decode; return decode(task->tk_rqstp, xdr, task->tk_msg.rpc_resp); } EXPORT_SYMBOL_GPL(rpcauth_unwrap_resp_decode); /** * rpcauth_unwrap_resp - Invoke unwrap and decode function for the cred * @task: controlling RPC task * @xdr: stream where the Reply message resides * * Returns zero on success; otherwise a negative errno is returned. */ int rpcauth_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr) { const struct rpc_credops *ops = task->tk_rqstp->rq_cred->cr_ops; return ops->crunwrap_resp(task, xdr); } bool rpcauth_xmit_need_reencode(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (!cred || !cred->cr_ops->crneed_reencode) return false; return cred->cr_ops->crneed_reencode(task); } int rpcauth_refreshcred(struct rpc_task *task) { struct rpc_cred *cred; int err; cred = task->tk_rqstp->rq_cred; if (cred == NULL) { err = rpcauth_bindcred(task, task->tk_msg.rpc_cred, task->tk_flags); if (err < 0) goto out; cred = task->tk_rqstp->rq_cred; } err = cred->cr_ops->crrefresh(task); out: if (err < 0) task->tk_status = err; return err; } void rpcauth_invalcred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; if (cred) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); } int rpcauth_uptodatecred(struct rpc_task *task) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; return cred == NULL || test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } static struct shrinker *rpc_cred_shrinker; int __init rpcauth_init_module(void) { int err; err = rpc_init_authunix(); if (err < 0) goto out1; rpc_cred_shrinker = shrinker_alloc(0, "sunrpc_cred"); if (!rpc_cred_shrinker) { err = -ENOMEM; goto out2; } rpc_cred_shrinker->count_objects = rpcauth_cache_shrink_count; rpc_cred_shrinker->scan_objects = rpcauth_cache_shrink_scan; shrinker_register(rpc_cred_shrinker); return 0; out2: rpc_destroy_authunix(); out1: return err; } void rpcauth_remove_module(void) { rpc_destroy_authunix(); shrinker_free(rpc_cred_shrinker); }
50 7648 12880 1 9 65 23 23 1995 51 68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H /* * Linux wait queue related types and methods */ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> #include <asm/current.h> typedef struct wait_queue_entry wait_queue_entry_t; typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_CUSTOM 0x04 #define WQ_FLAG_DONE 0x08 #define WQ_FLAG_PRIORITY 0x10 /* * A single wait-queue entry structure: */ struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; struct list_head entry; }; struct wait_queue_head { spinlock_t lock; struct list_head head; }; typedef struct wait_queue_head wait_queue_head_t; struct task_struct; /* * Macros for declaration and initialisaton of the datatypes */ #define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ .entry = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = LIST_HEAD_INIT(name.head) } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); #define init_waitqueue_head(wq_head) \ do { \ static struct lock_class_key __key; \ \ __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { wq_entry->flags = 0; wq_entry->private = p; wq_entry->func = default_wake_function; } static inline void init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { wq_entry->flags = 0; wq_entry->private = NULL; wq_entry->func = func; } /** * waitqueue_active -- locklessly test for waiters on the queue * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like:: * * CPU0 - waker CPU1 - waiter * * for (;;) { * @cond = true; prepare_to_wait(&wq_head, &wait, state); * smp_mb(); // smp_mb() from set_current_state() * if (waitqueue_active(wq_head)) if (@cond) * wake_up(wq_head); break; * schedule(); * } * finish_wait(&wq_head, &wait); * * Because without the explicit smp_mb() it's possible for the * waitqueue_active() load to get hoisted over the @cond store such that we'll * observe an empty wait list while the waiter might not observe @cond. * * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. */ static inline int waitqueue_active(struct wait_queue_head *wq_head) { return !list_empty(&wq_head->head); } /** * wq_has_single_sleeper - check if there is only one sleeper * @wq_head: wait queue head * * Returns true of wq_head has only one sleeper on the list. * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head) { return list_is_singular(&wq_head->head); } /** * wq_has_sleeper - check if there are any waiting processes * @wq_head: wait queue head * * Returns true if wq_head has waiting processes * * Please refer to the comment for waitqueue_active. */ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the * add_wait_queue modifications to the wait queue. * * This memory barrier should be paired with one on the * waiting side. */ smp_mb(); return waitqueue_active(wq_head); } extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { struct list_head *head = &wq_head->head; struct wait_queue_entry *wq; list_for_each_entry(wq, &wq_head->head, entry) { if (!(wq->flags & WQ_FLAG_PRIORITY)) break; head = &wq->entry; } list_add(&wq_entry->entry, head); } /* * Used for wake-one threads: */ static inline void __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq_head, wq_entry); } static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_del(&wq_entry->entry); } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) #define wake_up_interruptible_sync(x) __wake_up_sync((x), TASK_INTERRUPTIBLE) /* * Wakeup macros to be used to report events to the targets. */ #define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m)) #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) #define wake_up_poll_on_current_cpu(x, m) \ __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) /** * wake_up_pollfree - signal that a polled waitqueue is going away * @wq_head: the wait queue head * * In the very rare cases where a ->poll() implementation uses a waitqueue whose * lifetime is tied to a task rather than to the 'struct file' being polled, * this function must be called before the waitqueue is freed so that * non-blocking polls (e.g. epoll) are notified that the queue is going away. * * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. */ static inline void wake_up_pollfree(struct wait_queue_head *wq_head) { /* * For performance reasons, we don't always take the queue lock here. * Therefore, we might race with someone removing the last entry from * the queue, and proceed while they still hold the queue lock. * However, rcu_read_lock() is required to be held in such cases, so we * can safely proceed with an RCU-delayed free. */ if (waitqueue_active(wq_head)) __wake_up_pollfree(wq_head); } #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ if (__cond && !__ret) \ __ret = 1; \ __cond || !__ret; \ }) #define ___wait_is_interruptible(state) \ (!__builtin_constant_p(state) || \ (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. * * This is so that both can use the ___wait_cond_timeout() construct * to wrap the condition. * * The type inconsistency of the wait_event_*() __ret variable is also * on purpose; we use long where we can return timeout values and int * otherwise. */ #define ___wait_event(wq_head, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ struct wait_queue_entry __wq_entry; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\ \ if (condition) \ break; \ \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ goto __out; \ } \ \ cmd; \ } \ finish_wait(&wq_head, &__wq_entry); \ __out: __ret; \ }) #define __wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __wait_event(wq_head, condition); \ } while (0) #define __io_wait_event(wq_head, condition) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) /* * io_wait_event() -- like wait_event() but with io_schedule() */ #define io_wait_event(wq_head, condition) \ do { \ might_sleep(); \ if (condition) \ break; \ __io_wait_event(wq_head, condition); \ } while (0) #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ 0, 0, schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute * to system load) until the @condition evaluates to true. The * @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_freezable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable(wq_head, condition); \ __ret; \ }) #define __wait_event_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_UNINTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ __ret = schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid * increasing load and is freezable. */ #define wait_event_freezable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0, \ cmd1; schedule(); cmd2) /* * Just like wait_event_cmd(), except it sets exclusive flag */ #define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_cmd(wq_head, condition, cmd1, cmd2) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ cmd1; schedule(); cmd2) /** * wait_event_cmd - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @cmd1: the command will be executed before sleep * @cmd2: the command will be executed after sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ #define wait_event_cmd(wq_head, condition, cmd1, cmd2) \ do { \ if (condition) \ break; \ __wait_event_cmd(wq_head, condition, cmd1, cmd2); \ } while (0) #define __wait_event_interruptible(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** * wait_event_interruptible - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible(wq_head, condition); \ __ret; \ }) #define __wait_event_interruptible_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_INTERRUPTIBLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a signal. */ #define wait_event_interruptible_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ ({ \ int __ret = 0; \ struct hrtimer_sleeper __t; \ \ hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC, \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ current->timer_slack_ns); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ __ret = ___wait_event(wq_head, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ __ret; \ }) /** * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, or -ETIME if the timeout * elapsed. */ #define wait_event_hrtimeout(wq_head, condition, timeout) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /** * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, as a ktime_t * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function returns 0 if @condition became true, -ERESTARTSYS if it was * interrupted by a signal, or -ETIME if the timeout elapsed. */ #define wait_event_interruptible_hrtimeout(wq, condition, timeout) \ ({ \ long __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define __wait_event_interruptible_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_interruptible_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_killable_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 1, 0, \ schedule()) #define wait_event_killable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable_exclusive(wq, condition); \ __ret; \ }) #define __wait_event_freezable_exclusive(wq, condition) \ ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_freezable_exclusive(wq, condition); \ __ret; \ }) /** * wait_event_idle - wait for a condition without contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule()); \ } while (0) /** * wait_event_idle_exclusive - wait for a condition with contributing to system load * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. * The @condition is checked each time the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * */ #define wait_event_idle_exclusive(wq_head, condition) \ do { \ might_sleep(); \ if (!(condition)) \ ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule()); \ } while (0) #define __wait_event_idle_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_timeout(wq_head, condition, timeout); \ __ret; \ }) #define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_IDLE, 1, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_IDLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus if other processes wait on the same list, when this * process is woken further processes are not considered. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * or the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed. */ #define wait_event_idle_exclusive_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\ __ret; \ }) extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ int __ret; \ DEFINE_WAIT(__wait); \ if (exclusive) \ __wait.flags |= WQ_FLAG_EXCLUSIVE; \ do { \ __ret = fn(&(wq), &__wait); \ if (__ret) \ break; \ } while (!(condition)); \ __remove_wait_queue(&(wq), &__wait); \ __set_current_state(TASK_RUNNING); \ __ret; \ }) /** * wait_event_interruptible_locked - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr)) /** * wait_event_interruptible_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq)) /** * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock()/spin_unlock() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr)) /** * wait_event_interruptible_exclusive_locked_irq - sleep until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq is woken up. * * It must be called with wq.lock being held. This spinlock is * unlocked while sleeping but @condition testing is done while lock * is held and when this macro exits the lock is held. * * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq() * functions which must match the way they are locked/unlocked outside * of this macro. * * The process is put on the wait queue with an WQ_FLAG_EXCLUSIVE flag * set thus when other process waits process on the list if this * process is awaken further processes are not considered. * * wake_up_locked() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_interruptible_exclusive_locked_irq(wq, condition) \ ((condition) \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq)) #define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a * signal and 0 if @condition evaluated to true. */ #define wait_event_killable(wq_head, condition) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_killable(wq_head, condition); \ __ret; \ }) #define __wait_event_state(wq, condition, state) \ ___wait_event(wq, condition, state, 0, 0, schedule()) /** * wait_event_state - sleep until a condition gets true * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @state: state to sleep in * * The process is put to sleep (@state) until the @condition evaluates to true * or a signal is received (when allowed by @state). The @condition is checked * each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * The function will return -ERESTARTSYS if it was interrupted by a signal * (when allowed by @state) and 0 if @condition evaluated to true. */ #define wait_event_state(wq_head, condition, state) \ ({ \ int __ret = 0; \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_state(wq_head, condition, state); \ __ret; \ }) #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ __ret = schedule_timeout(__ret)) /** * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_KILLABLE) until the * @condition evaluates to true or a kill signal is received. * The @condition is checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * Returns: * 0 if the @condition evaluated to %false after the @timeout elapsed, * 1 if the @condition evaluated to %true after the @timeout elapsed, * the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was * interrupted by a kill signal. * * Only kill signals interrupt this process. */ #define wait_event_killable_timeout(wq_head, condition, timeout) \ ({ \ long __ret = timeout; \ might_sleep(); \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_killable_timeout(wq_head, \ condition, timeout); \ __ret; \ }) #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd * and schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. */ #define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, cmd); \ } while (0) /** * wait_event_lock_irq - sleep until a condition gets true. The * condition is checked under the lock. This * is expected to be called with the lock * taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the * @condition evaluates to true. The @condition is checked each time * the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. */ #define wait_event_lock_irq(wq_head, condition, lock) \ do { \ if (condition) \ break; \ __wait_event_lock_irq(wq_head, condition, lock, ); \ } while (0) #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. * The condition is checked under the lock. This is expected to * be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before cmd and * schedule() and reacquired afterwards. * @cmd: a command which is invoked outside the critical section before * sleep * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or a signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before invoking the cmd and going to sleep and is reacquired * afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock, cmd); \ __ret; \ }) /** * wait_event_interruptible_lock_irq - sleep until a condition gets true. * The condition is checked under the lock. This is expected * to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The macro will return -ERESTARTSYS if it was interrupted by a signal * and 0 if @condition evaluated to true. */ #define wait_event_interruptible_lock_irq(wq_head, condition, lock) \ ({ \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq_head, \ condition, lock,); \ __ret; \ }) #define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ state, 0, timeout, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets * true or a timeout elapses. The condition is checked under * the lock. This is expected to be called with the lock taken. * @wq_head: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() * and reacquired afterwards. * @timeout: timeout, in jiffies * * The process is put to sleep (TASK_INTERRUPTIBLE) until the * @condition evaluates to true or signal is received. The @condition is * checked each time the waitqueue @wq_head is woken up. * * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * * This is supposed to be called while holding the lock. The lock is * dropped before going to sleep and is reacquired afterwards. * * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it * was interrupted by a signal, and the remaining jiffies otherwise * if the condition evaluated to true before the timeout elapsed. */ #define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock, \ timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_INTERRUPTIBLE); \ __ret; \ }) #define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout) \ ({ \ long __ret = timeout; \ if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_lock_irq_timeout( \ wq_head, condition, lock, timeout, \ TASK_UNINTERRUPTIBLE); \ __ret; \ }) /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ struct wait_queue_entry name = { \ .private = current, \ .func = function, \ .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define init_wait(wait) \ do { \ (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) typedef int (*task_call_f)(struct task_struct *p, void *arg); extern int task_call_func(struct task_struct *p, task_call_f func, void *arg); #endif /* _LINUX_WAIT_H */
130 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel reference Implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel reference Implementation * * Various protocol defined structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * randall@sctp.chicago.il.us * kmorneau@cisco.com * qxie1@email.mot.com * Sridhar Samudrala <sri@us.ibm.com> * Kevin Gao <kevin.gao@intel.com> * * Any bugs reported given to us we will try to fix... any fixes shared will * be incorporated into the next SCTP release. */ #ifndef __LINUX_SCTP_H__ #define __LINUX_SCTP_H__ #include <linux/in.h> /* We need in_addr. */ #include <linux/in6.h> /* We need in6_addr. */ #include <linux/skbuff.h> #include <uapi/linux/sctp.h> /* Section 3.1. SCTP Common Header Format */ struct sctphdr { __be16 source; __be16 dest; __be32 vtag; __le32 checksum; }; static inline struct sctphdr *sctp_hdr(const struct sk_buff *skb) { return (struct sctphdr *)skb_transport_header(skb); } /* Section 3.2. Chunk Field Descriptions. */ struct sctp_chunkhdr { __u8 type; __u8 flags; __be16 length; }; /* Section 3.2. Chunk Type Values. * [Chunk Type] identifies the type of information contained in the Chunk * Value field. It takes a value from 0 to 254. The value of 255 is * reserved for future use as an extension field. */ enum sctp_cid { SCTP_CID_DATA = 0, SCTP_CID_INIT = 1, SCTP_CID_INIT_ACK = 2, SCTP_CID_SACK = 3, SCTP_CID_HEARTBEAT = 4, SCTP_CID_HEARTBEAT_ACK = 5, SCTP_CID_ABORT = 6, SCTP_CID_SHUTDOWN = 7, SCTP_CID_SHUTDOWN_ACK = 8, SCTP_CID_ERROR = 9, SCTP_CID_COOKIE_ECHO = 10, SCTP_CID_COOKIE_ACK = 11, SCTP_CID_ECN_ECNE = 12, SCTP_CID_ECN_CWR = 13, SCTP_CID_SHUTDOWN_COMPLETE = 14, /* AUTH Extension Section 4.1 */ SCTP_CID_AUTH = 0x0F, /* sctp ndata 5.1. I-DATA */ SCTP_CID_I_DATA = 0x40, /* PR-SCTP Sec 3.2 */ SCTP_CID_FWD_TSN = 0xC0, /* Use hex, as defined in ADDIP sec. 3.1 */ SCTP_CID_ASCONF = 0xC1, SCTP_CID_I_FWD_TSN = 0xC2, SCTP_CID_ASCONF_ACK = 0x80, SCTP_CID_RECONF = 0x82, SCTP_CID_PAD = 0x84, }; /* enum */ /* Section 3.2 * Chunk Types are encoded such that the highest-order two bits specify * the action that must be taken if the processing endpoint does not * recognize the Chunk Type. */ enum { SCTP_CID_ACTION_DISCARD = 0x00, SCTP_CID_ACTION_DISCARD_ERR = 0x40, SCTP_CID_ACTION_SKIP = 0x80, SCTP_CID_ACTION_SKIP_ERR = 0xc0, }; enum { SCTP_CID_ACTION_MASK = 0xc0, }; /* This flag is used in Chunk Flags for ABORT and SHUTDOWN COMPLETE. * * 3.3.7 Abort Association (ABORT) (6): * The T bit is set to 0 if the sender had a TCB that it destroyed. * If the sender did not have a TCB it should set this bit to 1. */ enum { SCTP_CHUNK_FLAG_T = 0x01 }; /* * Set the T bit * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 14 |Reserved |T| Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Reserved: 7 bits * Set to 0 on transmit and ignored on receipt. * * T bit: 1 bit * The T bit is set to 0 if the sender had a TCB that it destroyed. If * the sender did NOT have a TCB it should set this bit to 1. * * Note: Special rules apply to this chunk for verification, please * see Section 8.5.1 for details. */ #define sctp_test_T_bit(c) ((c)->chunk_hdr->flags & SCTP_CHUNK_FLAG_T) /* RFC 2960 * Section 3.2.1 Optional/Variable-length Parmaeter Format. */ struct sctp_paramhdr { __be16 type; __be16 length; }; enum sctp_param { /* RFC 2960 Section 3.3.5 */ SCTP_PARAM_HEARTBEAT_INFO = cpu_to_be16(1), /* RFC 2960 Section 3.3.2.1 */ SCTP_PARAM_IPV4_ADDRESS = cpu_to_be16(5), SCTP_PARAM_IPV6_ADDRESS = cpu_to_be16(6), SCTP_PARAM_STATE_COOKIE = cpu_to_be16(7), SCTP_PARAM_UNRECOGNIZED_PARAMETERS = cpu_to_be16(8), SCTP_PARAM_COOKIE_PRESERVATIVE = cpu_to_be16(9), SCTP_PARAM_HOST_NAME_ADDRESS = cpu_to_be16(11), SCTP_PARAM_SUPPORTED_ADDRESS_TYPES = cpu_to_be16(12), SCTP_PARAM_ECN_CAPABLE = cpu_to_be16(0x8000), /* AUTH Extension Section 3 */ SCTP_PARAM_RANDOM = cpu_to_be16(0x8002), SCTP_PARAM_CHUNKS = cpu_to_be16(0x8003), SCTP_PARAM_HMAC_ALGO = cpu_to_be16(0x8004), /* Add-IP: Supported Extensions, Section 4.2 */ SCTP_PARAM_SUPPORTED_EXT = cpu_to_be16(0x8008), /* PR-SCTP Sec 3.1 */ SCTP_PARAM_FWD_TSN_SUPPORT = cpu_to_be16(0xc000), /* Add-IP Extension. Section 3.2 */ SCTP_PARAM_ADD_IP = cpu_to_be16(0xc001), SCTP_PARAM_DEL_IP = cpu_to_be16(0xc002), SCTP_PARAM_ERR_CAUSE = cpu_to_be16(0xc003), SCTP_PARAM_SET_PRIMARY = cpu_to_be16(0xc004), SCTP_PARAM_SUCCESS_REPORT = cpu_to_be16(0xc005), SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006), /* RE-CONFIG. Section 4 */ SCTP_PARAM_RESET_OUT_REQUEST = cpu_to_be16(0x000d), SCTP_PARAM_RESET_IN_REQUEST = cpu_to_be16(0x000e), SCTP_PARAM_RESET_TSN_REQUEST = cpu_to_be16(0x000f), SCTP_PARAM_RESET_RESPONSE = cpu_to_be16(0x0010), SCTP_PARAM_RESET_ADD_OUT_STREAMS = cpu_to_be16(0x0011), SCTP_PARAM_RESET_ADD_IN_STREAMS = cpu_to_be16(0x0012), }; /* enum */ /* RFC 2960 Section 3.2.1 * The Parameter Types are encoded such that the highest-order two bits * specify the action that must be taken if the processing endpoint does * not recognize the Parameter Type. * */ enum { SCTP_PARAM_ACTION_DISCARD = cpu_to_be16(0x0000), SCTP_PARAM_ACTION_DISCARD_ERR = cpu_to_be16(0x4000), SCTP_PARAM_ACTION_SKIP = cpu_to_be16(0x8000), SCTP_PARAM_ACTION_SKIP_ERR = cpu_to_be16(0xc000), }; enum { SCTP_PARAM_ACTION_MASK = cpu_to_be16(0xc000), }; /* RFC 2960 Section 3.3.1 Payload Data (DATA) (0) */ struct sctp_datahdr { __be32 tsn; __be16 stream; __be16 ssn; __u32 ppid; /* __u8 payload[]; */ }; struct sctp_data_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_datahdr data_hdr; }; struct sctp_idatahdr { __be32 tsn; __be16 stream; __be16 reserved; __be32 mid; union { __u32 ppid; __be32 fsn; }; __u8 payload[0]; }; struct sctp_idata_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_idatahdr data_hdr; }; /* DATA Chuck Specific Flags */ enum { SCTP_DATA_MIDDLE_FRAG = 0x00, SCTP_DATA_LAST_FRAG = 0x01, SCTP_DATA_FIRST_FRAG = 0x02, SCTP_DATA_NOT_FRAG = 0x03, SCTP_DATA_UNORDERED = 0x04, SCTP_DATA_SACK_IMM = 0x08, }; enum { SCTP_DATA_FRAG_MASK = 0x03, }; /* RFC 2960 Section 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. */ struct sctp_inithdr { __be32 init_tag; __be32 a_rwnd; __be16 num_outbound_streams; __be16 num_inbound_streams; __be32 initial_tsn; /* __u8 params[]; */ }; struct sctp_init_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_inithdr init_hdr; }; /* Section 3.3.2.1. IPv4 Address Parameter (5) */ struct sctp_ipv4addr_param { struct sctp_paramhdr param_hdr; struct in_addr addr; }; /* Section 3.3.2.1. IPv6 Address Parameter (6) */ struct sctp_ipv6addr_param { struct sctp_paramhdr param_hdr; struct in6_addr addr; }; /* Section 3.3.2.1 Cookie Preservative (9) */ struct sctp_cookie_preserve_param { struct sctp_paramhdr param_hdr; __be32 lifespan_increment; }; /* Section 3.3.2.1 Host Name Address (11) */ struct sctp_hostname_param { struct sctp_paramhdr param_hdr; uint8_t hostname[]; }; /* Section 3.3.2.1 Supported Address Types (12) */ struct sctp_supported_addrs_param { struct sctp_paramhdr param_hdr; __be16 types[]; }; /* ADDIP Section 3.2.6 Adaptation Layer Indication */ struct sctp_adaptation_ind_param { struct sctp_paramhdr param_hdr; __be32 adaptation_ind; }; /* ADDIP Section 4.2.7 Supported Extensions Parameter */ struct sctp_supported_ext_param { struct sctp_paramhdr param_hdr; __u8 chunks[]; }; /* AUTH Section 3.1 Random */ struct sctp_random_param { struct sctp_paramhdr param_hdr; __u8 random_val[]; }; /* AUTH Section 3.2 Chunk List */ struct sctp_chunks_param { struct sctp_paramhdr param_hdr; __u8 chunks[]; }; /* AUTH Section 3.3 HMAC Algorithm */ struct sctp_hmac_algo_param { struct sctp_paramhdr param_hdr; __be16 hmac_ids[]; }; /* RFC 2960. Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2): * The INIT ACK chunk is used to acknowledge the initiation of an SCTP * association. */ struct sctp_initack_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_inithdr init_hdr; }; /* Section 3.3.3.1 State Cookie (7) */ struct sctp_cookie_param { struct sctp_paramhdr p; __u8 body[]; }; /* Section 3.3.3.1 Unrecognized Parameters (8) */ struct sctp_unrecognized_param { struct sctp_paramhdr param_hdr; struct sctp_paramhdr unrecognized; }; /* * 3.3.4 Selective Acknowledgement (SACK) (3): * * This chunk is sent to the peer endpoint to acknowledge received DATA * chunks and to inform the peer endpoint of gaps in the received * subsequences of DATA chunks as represented by their TSNs. */ struct sctp_gap_ack_block { __be16 start; __be16 end; }; union sctp_sack_variable { struct sctp_gap_ack_block gab; __be32 dup; }; struct sctp_sackhdr { __be32 cum_tsn_ack; __be32 a_rwnd; __be16 num_gap_ack_blocks; __be16 num_dup_tsns; /* union sctp_sack_variable variable[]; */ }; struct sctp_sack_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_sackhdr sack_hdr; }; /* RFC 2960. Section 3.3.5 Heartbeat Request (HEARTBEAT) (4): * * An endpoint should send this chunk to its peer endpoint to probe the * reachability of a particular destination transport address defined in * the present association. */ struct sctp_heartbeathdr { struct sctp_paramhdr info; }; struct sctp_heartbeat_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_heartbeathdr hb_hdr; }; /* PAD chunk could be bundled with heartbeat chunk to probe pmtu */ struct sctp_pad_chunk { struct sctp_chunkhdr uh; }; /* For the abort and shutdown ACK we must carry the init tag in the * common header. Just the common header is all that is needed with a * chunk descriptor. */ struct sctp_abort_chunk { struct sctp_chunkhdr uh; }; /* For the graceful shutdown we must carry the tag (in common header) * and the highest consecutive acking value. */ struct sctp_shutdownhdr { __be32 cum_tsn_ack; }; struct sctp_shutdown_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_shutdownhdr shutdown_hdr; }; /* RFC 2960. Section 3.3.10 Operation Error (ERROR) (9) */ struct sctp_errhdr { __be16 cause; __be16 length; /* __u8 variable[]; */ }; struct sctp_operr_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_errhdr err_hdr; }; /* RFC 2960 3.3.10 - Operation Error * * Cause Code: 16 bits (unsigned integer) * * Defines the type of error conditions being reported. * Cause Code * Value Cause Code * --------- ---------------- * 1 Invalid Stream Identifier * 2 Missing Mandatory Parameter * 3 Stale Cookie Error * 4 Out of Resource * 5 Unresolvable Address * 6 Unrecognized Chunk Type * 7 Invalid Mandatory Parameter * 8 Unrecognized Parameters * 9 No User Data * 10 Cookie Received While Shutting Down */ enum sctp_error { SCTP_ERROR_NO_ERROR = cpu_to_be16(0x00), SCTP_ERROR_INV_STRM = cpu_to_be16(0x01), SCTP_ERROR_MISS_PARAM = cpu_to_be16(0x02), SCTP_ERROR_STALE_COOKIE = cpu_to_be16(0x03), SCTP_ERROR_NO_RESOURCE = cpu_to_be16(0x04), SCTP_ERROR_DNS_FAILED = cpu_to_be16(0x05), SCTP_ERROR_UNKNOWN_CHUNK = cpu_to_be16(0x06), SCTP_ERROR_INV_PARAM = cpu_to_be16(0x07), SCTP_ERROR_UNKNOWN_PARAM = cpu_to_be16(0x08), SCTP_ERROR_NO_DATA = cpu_to_be16(0x09), SCTP_ERROR_COOKIE_IN_SHUTDOWN = cpu_to_be16(0x0a), /* SCTP Implementation Guide: * 11 Restart of an association with new addresses * 12 User Initiated Abort * 13 Protocol Violation * 14 Restart of an Association with New Encapsulation Port */ SCTP_ERROR_RESTART = cpu_to_be16(0x0b), SCTP_ERROR_USER_ABORT = cpu_to_be16(0x0c), SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d), SCTP_ERROR_NEW_ENCAP_PORT = cpu_to_be16(0x0e), /* ADDIP Section 3.3 New Error Causes * * Four new Error Causes are added to the SCTP Operational Errors, * primarily for use in the ASCONF-ACK chunk. * * Value Cause Code * --------- ---------------- * 0x00A0 Request to Delete Last Remaining IP Address. * 0x00A1 Operation Refused Due to Resource Shortage. * 0x00A2 Request to Delete Source IP Address. * 0x00A3 Association Aborted due to illegal ASCONF-ACK * 0x00A4 Request refused - no authorization. */ SCTP_ERROR_DEL_LAST_IP = cpu_to_be16(0x00A0), SCTP_ERROR_RSRC_LOW = cpu_to_be16(0x00A1), SCTP_ERROR_DEL_SRC_IP = cpu_to_be16(0x00A2), SCTP_ERROR_ASCONF_ACK = cpu_to_be16(0x00A3), SCTP_ERROR_REQ_REFUSED = cpu_to_be16(0x00A4), /* AUTH Section 4. New Error Cause * * This section defines a new error cause that will be sent if an AUTH * chunk is received with an unsupported HMAC identifier. * illustrates the new error cause. * * Cause Code Error Cause Name * -------------------------------------------------------------- * 0x0105 Unsupported HMAC Identifier */ SCTP_ERROR_UNSUP_HMAC = cpu_to_be16(0x0105) }; /* RFC 2960. Appendix A. Explicit Congestion Notification. * Explicit Congestion Notification Echo (ECNE) (12) */ struct sctp_ecnehdr { __be32 lowest_tsn; }; struct sctp_ecne_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_ecnehdr ence_hdr; }; /* RFC 2960. Appendix A. Explicit Congestion Notification. * Congestion Window Reduced (CWR) (13) */ struct sctp_cwrhdr { __be32 lowest_tsn; }; /* PR-SCTP * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN) * * Forward Cumulative TSN chunk has the following format: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 192 | Flags = 0x00 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | New Cumulative TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream-1 | Stream Sequence-1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream-N | Stream Sequence-N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: * * Set to all zeros on transmit and ignored on receipt. * * New Cumulative TSN: 32 bit u_int * * This indicates the new cumulative TSN to the data receiver. Upon * the reception of this value, the data receiver MUST consider * any missing TSNs earlier than or equal to this value as received * and stop reporting them as gaps in any subsequent SACKs. * * Stream-N: 16 bit u_int * * This field holds a stream number that was skipped by this * FWD-TSN. * * Stream Sequence-N: 16 bit u_int * This field holds the sequence number associated with the stream * that was skipped. The stream sequence field holds the largest stream * sequence number in this stream being skipped. The receiver of * the FWD-TSN's can use the Stream-N and Stream Sequence-N fields * to enable delivery of any stranded TSN's that remain on the stream * re-ordering queues. This field MUST NOT report TSN's corresponding * to DATA chunk that are marked as unordered. For ordered DATA * chunks this field MUST be filled in. */ struct sctp_fwdtsn_skip { __be16 stream; __be16 ssn; }; struct sctp_fwdtsn_hdr { __be32 new_cum_tsn; /* struct sctp_fwdtsn_skip skip[]; */ }; struct sctp_fwdtsn_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_fwdtsn_hdr fwdtsn_hdr; }; struct sctp_ifwdtsn_skip { __be16 stream; __u8 reserved; __u8 flags; __be32 mid; }; struct sctp_ifwdtsn_hdr { __be32 new_cum_tsn; /* struct sctp_ifwdtsn_skip skip[]; */ }; struct sctp_ifwdtsn_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_ifwdtsn_hdr fwdtsn_hdr; }; /* ADDIP * Section 3.1.1 Address Configuration Change Chunk (ASCONF) * * Serial Number: 32 bits (unsigned integer) * This value represents a Serial Number for the ASCONF Chunk. The * valid range of Serial Number is from 0 to 2^32-1. * Serial Numbers wrap back to 0 after reaching 2^32 -1. * * Address Parameter: 8 or 20 bytes (depending on type) * The address is an address of the sender of the ASCONF chunk, * the address MUST be considered part of the association by the * peer endpoint. This field may be used by the receiver of the * ASCONF to help in finding the association. This parameter MUST * be present in every ASCONF message i.e. it is a mandatory TLV * parameter. * * ASCONF Parameter: TLV format * Each Address configuration change is represented by a TLV * parameter as defined in Section 3.2. One or more requests may * be present in an ASCONF Chunk. * * Section 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * * Serial Number: 32 bits (unsigned integer) * This value represents the Serial Number for the received ASCONF * Chunk that is acknowledged by this chunk. This value is copied * from the received ASCONF Chunk. * * ASCONF Parameter Response: TLV format * The ASCONF Parameter Response is used in the ASCONF-ACK to * report status of ASCONF processing. */ struct sctp_addip_param { struct sctp_paramhdr param_hdr; __be32 crr_id; }; struct sctp_addiphdr { __be32 serial; /* __u8 params[]; */ }; struct sctp_addip_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_addiphdr addip_hdr; }; /* AUTH * Section 4.1 Authentication Chunk (AUTH) * * This chunk is used to hold the result of the HMAC calculation. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x0F | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Shared Key Identifier | HMAC Identifier | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ HMAC / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Type: 1 byte (unsigned integer) * This value MUST be set to 0x0F for all AUTH-chunks. * * Flags: 1 byte (unsigned integer) * Set to zero on transmit and ignored on receipt. * * Length: 2 bytes (unsigned integer) * This value holds the length of the HMAC in bytes plus 8. * * Shared Key Identifier: 2 bytes (unsigned integer) * This value describes which endpoint pair shared key is used. * * HMAC Identifier: 2 bytes (unsigned integer) * This value describes which message digest is being used. Table 2 * shows the currently defined values. * * The following Table 2 shows the currently defined values for HMAC * identifiers. * * +-----------------+--------------------------+ * | HMAC Identifier | Message Digest Algorithm | * +-----------------+--------------------------+ * | 0 | Reserved | * | 1 | SHA-1 defined in [8] | * | 2 | Reserved | * | 3 | SHA-256 defined in [8] | * +-----------------+--------------------------+ * * * HMAC: n bytes (unsigned integer) This hold the result of the HMAC * calculation. */ struct sctp_authhdr { __be16 shkey_id; __be16 hmac_id; /* __u8 hmac[]; */ }; struct sctp_auth_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_authhdr auth_hdr; }; struct sctp_infox { struct sctp_info *sctpinfo; struct sctp_association *asoc; }; struct sctp_reconf_chunk { struct sctp_chunkhdr chunk_hdr; /* __u8 params[]; */ }; struct sctp_strreset_outreq { struct sctp_paramhdr param_hdr; __be32 request_seq; __be32 response_seq; __be32 send_reset_at_tsn; __be16 list_of_streams[]; }; struct sctp_strreset_inreq { struct sctp_paramhdr param_hdr; __be32 request_seq; __be16 list_of_streams[]; }; struct sctp_strreset_tsnreq { struct sctp_paramhdr param_hdr; __be32 request_seq; }; struct sctp_strreset_addstrm { struct sctp_paramhdr param_hdr; __be32 request_seq; __be16 number_of_streams; __be16 reserved; }; enum { SCTP_STRRESET_NOTHING_TO_DO = 0x00, SCTP_STRRESET_PERFORMED = 0x01, SCTP_STRRESET_DENIED = 0x02, SCTP_STRRESET_ERR_WRONG_SSN = 0x03, SCTP_STRRESET_ERR_IN_PROGRESS = 0x04, SCTP_STRRESET_ERR_BAD_SEQNO = 0x05, SCTP_STRRESET_IN_PROGRESS = 0x06, }; struct sctp_strreset_resp { struct sctp_paramhdr param_hdr; __be32 response_seq; __be32 result; }; struct sctp_strreset_resptsn { struct sctp_paramhdr param_hdr; __be32 response_seq; __be32 result; __be32 senders_next_tsn; __be32 receivers_next_tsn; }; enum { SCTP_DSCP_SET_MASK = 0x1, SCTP_DSCP_VAL_MASK = 0xfc, SCTP_FLOWLABEL_SET_MASK = 0x100000, SCTP_FLOWLABEL_VAL_MASK = 0xfffff }; /* UDP Encapsulation * draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.html#section-4-4 * * The error cause indicating an "Restart of an Association with * New Encapsulation Port" * * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cause Code = 14 | Cause Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Current Encapsulation Port | New Encapsulation Port | * +-------------------------------+-------------------------------+ */ struct sctp_new_encap_port_hdr { __be16 cur_port; __be16 new_port; }; /* Round an int up to the next multiple of 4. */ #define SCTP_PAD4(s) (((s)+3)&~3) /* Truncate to the previous multiple of 4. */ #define SCTP_TRUNC4(s) ((s)&~3) #endif /* __LINUX_SCTP_H__ */
309 309 307 309 309 41 41 41 41 41 24 29 29 18 302 304 254 303 304 304 304 304 304 304 199 149 149 304 304 309 244 308 308 309 16 304 305 304 25 308 308 5 11 255 208 208 255 66 64 66 5 2 1 64 63 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bfind.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Search routines for btrees */ #include <linux/slab.h> #include "hfsplus_fs.h" int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) { void *ptr; fd->tree = tree; fd->bnode = NULL; ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); return 0; } void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { __be32 cur_cnid; __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; search_cnid = fd->search_key->ext.cnid; } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { cur_cnid = fd->key->cat.parent; search_cnid = fd->search_key->cat.parent; } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; } else { cur_cnid = 0; /* used-uninitialized warning */ search_cnid = 0; BUG(); } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); if ((*begin) == (*end)) return 1; } else { if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) (*begin) = (*cur_rec) + 1; else (*end) = (*cur_rec) - 1; } return 0; } int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { int cmpval; cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { (*end) = (*cur_rec); return 1; } if (cmpval < 0) (*begin) = (*cur_rec) + 1; else *(end) = (*cur_rec) - 1; return 0; } /* Find the record in bnode that best matches key (not greater than...)*/ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, search_strategy_t rec_found) { u16 off, len, keylen; int rec; int b, e; int res; BUG_ON(!rec_found); b = 0; e = bnode->num_recs - 1; res = -ENOENT; do { rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); if (rec_found(bnode, fd, &b, &e, &rec)) { res = 0; goto done; } } while (b <= e); if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); } done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) { struct hfs_btree *tree; struct hfs_bnode *bnode; u32 nidx, parent; __be32 data; int height, res; tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); fd->bnode = NULL; nidx = tree->root; if (!nidx) return -ENOENT; height = tree->depth; res = 0; parent = 0; for (;;) { bnode = hfs_bnode_find(tree, nidx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; break; } if (bnode->height != height) goto invalid; if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) goto invalid; bnode->parent = parent; res = __hfs_brec_find(bnode, fd, do_key_compare); if (!height) break; if (fd->record < 0) goto release; parent = nidx; hfs_bnode_read(bnode, &data, fd->entryoffset, 4); nidx = be32_to_cpu(data); hfs_bnode_put(bnode); } fd->bnode = bnode; return res; invalid: pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); return res; } int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) { int res; res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res) return res; if (fd->entrylength > rec_len) return -EINVAL; hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); return 0; } int hfs_brec_goto(struct hfs_find_data *fd, int cnt) { struct hfs_btree *tree; struct hfs_bnode *bnode; int idx, res = 0; u16 off, len, keylen; bnode = fd->bnode; tree = bnode->tree; if (cnt < 0) { cnt = -cnt; while (cnt > fd->record) { cnt -= fd->record + 1; fd->record = bnode->num_recs - 1; idx = bnode->prev; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record -= cnt; } else { while (cnt >= bnode->num_recs - fd->record) { cnt -= bnode->num_recs - fd->record; fd->record = 0; idx = bnode->next; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record += cnt; } len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); if (keylen == 0) { res = -EINVAL; goto out; } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; hfs_bnode_read(bnode, fd->key, off, keylen); out: fd->bnode = bnode; return res; }
21 1 11 11 10 10 24 24 24 24 24 24 10 10 10 10 10 10 10 10 7 7 7 11 10 10 10 10 10 10 10 10 7 7 7 7 7 10 10 10 10 1 5 5 5 5 3 4 4 2 2 2 4 4 1 4 2 2 3 3 2 1 8 8 8 5 5 3 3 8 3 2 2 8 3 3 3 3 3 3 1 2 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 // SPDX-License-Identifier: GPL-2.0-only /* * Page Attribute Table (PAT) support: handle memory caching attributes in page tables. * * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> * Suresh B Siddha <suresh.b.siddha@intel.com> * * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. * * Basic principles: * * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and * the kernel to set one of a handful of 'caching type' attributes for physical * memory ranges: uncached, write-combining, write-through, write-protected, * and the most commonly used and default attribute: write-back caching. * * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is * a hardware interface to enumerate a limited number of physical memory ranges * and set their caching attributes explicitly, programmed into the CPU via MSRs. * Even modern CPUs have MTRRs enabled - but these are typically not touched * by the kernel or by user-space (such as the X server), we rely on PAT for any * additional cache attribute logic. * * PAT doesn't work via explicit memory ranges, but uses page table entries to add * cache attribute information to the mapped memory range: there's 3 bits used, * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT). * * ( There's a metric ton of finer details, such as compatibility with CPU quirks * that only support 4 types of PAT entries, and interaction with MTRRs, see * below for details. ) */ #include <linux/seq_file.h> #include <linux/memblock.h> #include <linux/debugfs.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/pfn_t.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rbtree.h> #include <asm/cacheflush.h> #include <asm/cacheinfo.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> #include <asm/fcntl.h> #include <asm/e820/api.h> #include <asm/mtrr.h> #include <asm/page.h> #include <asm/msr.h> #include <asm/memtype.h> #include <asm/io.h> #include "memtype.h" #include "../mm_internal.h" #undef pr_fmt #define pr_fmt(fmt) "" fmt static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); static u64 __ro_after_init pat_msr_val; /* * PAT support is enabled by default, but can be disabled for * various user-requested or hardware-forced reasons: */ static void __init pat_disable(const char *msg_reason) { if (pat_disabled) return; pat_disabled = true; pr_info("x86/PAT: %s\n", msg_reason); memory_caching_control &= ~CACHE_PAT; } static int __init nopat(char *str) { pat_disable("PAT support disabled via boot option."); return 0; } early_param("nopat", nopat); bool pat_enabled(void) { return !pat_disabled; } EXPORT_SYMBOL_GPL(pat_enabled); int pat_debug_enable; static int __init pat_debug_setup(char *str) { pat_debug_enable = 1; return 1; } __setup("debugpat", pat_debug_setup); #ifdef CONFIG_X86_PAT /* * X86 PAT uses page flags arch_1 and arch_2 together to keep track of * memory type of pages that have backing page struct. * * X86 PAT supports 4 different memory types: * - _PAGE_CACHE_MODE_WB * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_WT * * _PAGE_CACHE_MODE_WB is the default type. */ #define _PGMT_WB 0 #define _PGMT_WC (1UL << PG_arch_1) #define _PGMT_UC_MINUS (1UL << PG_arch_2) #define _PGMT_WT (1UL << PG_arch_2 | 1UL << PG_arch_1) #define _PGMT_MASK (1UL << PG_arch_2 | 1UL << PG_arch_1) #define _PGMT_CLEAR_MASK (~_PGMT_MASK) static inline enum page_cache_mode get_page_memtype(struct page *pg) { unsigned long pg_flags = pg->flags & _PGMT_MASK; if (pg_flags == _PGMT_WB) return _PAGE_CACHE_MODE_WB; else if (pg_flags == _PGMT_WC) return _PAGE_CACHE_MODE_WC; else if (pg_flags == _PGMT_UC_MINUS) return _PAGE_CACHE_MODE_UC_MINUS; else return _PAGE_CACHE_MODE_WT; } static inline void set_page_memtype(struct page *pg, enum page_cache_mode memtype) { unsigned long memtype_flags; unsigned long old_flags; unsigned long new_flags; switch (memtype) { case _PAGE_CACHE_MODE_WC: memtype_flags = _PGMT_WC; break; case _PAGE_CACHE_MODE_UC_MINUS: memtype_flags = _PGMT_UC_MINUS; break; case _PAGE_CACHE_MODE_WT: memtype_flags = _PGMT_WT; break; case _PAGE_CACHE_MODE_WB: default: memtype_flags = _PGMT_WB; break; } old_flags = READ_ONCE(pg->flags); do { new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags)); } #else static inline enum page_cache_mode get_page_memtype(struct page *pg) { return -1; } static inline void set_page_memtype(struct page *pg, enum page_cache_mode memtype) { } #endif #define CM(c) (_PAGE_CACHE_MODE_ ## c) static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, char *msg) { enum page_cache_mode cache; char *cache_mode; switch (pat_val) { case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break; case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break; case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break; case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break; case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break; case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; default: cache = CM(WB); cache_mode = "WB "; break; } memcpy(msg, cache_mode, 4); return cache; } #undef CM /* * Update the cache mode to pgprot translation tables according to PAT * configuration. * Using lower indices is preferred, so we start with highest index. */ static void __init init_cache_modes(u64 pat) { enum page_cache_mode cache; char pat_msg[33]; int i; pat_msg[32] = 0; for (i = 7; i >= 0; i--) { cache = pat_get_cache_mode((pat >> (i * 8)) & 7, pat_msg + 4 * i); update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); } void pat_cpu_init(void) { if (!boot_cpu_has(X86_FEATURE_PAT)) { /* * If this happens we are on a secondary CPU, but switched to * PAT on the boot CPU. We have no way to undo PAT. */ panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); } wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); __flush_tlb_all(); } /** * pat_bp_init - Initialize the PAT MSR value and PAT table * * This function initializes PAT MSR value and PAT table with an OS-defined * value to enable additional cache attributes, WC, WT and WP. * * This function prepares the calls of pat_cpu_init() via cache_cpu_init() * on all CPUs. */ void __init pat_bp_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; if (!IS_ENABLED(CONFIG_X86_PAT)) pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); if (!cpu_feature_enabled(X86_FEATURE_PAT)) pat_disable("PAT not supported by the CPU."); else rdmsrl(MSR_IA32_CR_PAT, pat_msr_val); if (!pat_msr_val) { pat_disable("PAT support disabled by the firmware."); /* * No PAT. Emulate the PAT table that corresponds to the two * cache bits, PWT (Write Through) and PCD (Cache Disable). * This setup is also the same as the BIOS default setup. * * PTE encoding: * * PCD * |PWT PAT * || slot * 00 0 WB : _PAGE_CACHE_MODE_WB * 01 1 WT : _PAGE_CACHE_MODE_WT * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 11 3 UC : _PAGE_CACHE_MODE_UC * * NOTE: When WC or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); } /* * Xen PV doesn't allow to set PAT MSR, but all cache modes are * supported. */ if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) { init_cache_modes(pat_msr_val); return; } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { /* * PAT support with the lower four entries. Intel Pentium 2, * 3, M, and 4 are affected by PAT errata, which makes the * upper four entries unusable. To be on the safe side, we don't * use those. * * PTE encoding: * PAT * |PCD * ||PWT PAT * ||| slot * 000 0 WB : _PAGE_CACHE_MODE_WB * 001 1 WC : _PAGE_CACHE_MODE_WC * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * PAT bit unused * * NOTE: When WT or WP is used, it is redirected to UC- per * the default setup in __cachemode2pte_tbl[]. */ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); } else { /* * Full PAT support. We put WT in slot 7 to improve * robustness in the presence of errata that might cause * the high PAT bit to be ignored. This way, a buggy slot 7 * access will hit slot 3, and slot 3 is UC, so at worst * we lose performance without causing a correctness issue. * Pentium 4 erratum N46 is an example for such an erratum, * although we try not to use PAT at all on affected CPUs. * * PTE encoding: * PAT * |PCD * ||PWT PAT * ||| slot * 000 0 WB : _PAGE_CACHE_MODE_WB * 001 1 WC : _PAGE_CACHE_MODE_WC * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * * The reserved slots are unused, but mapped to their * corresponding types in the presence of PAT errata. */ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); } memory_caching_control |= CACHE_PAT; init_cache_modes(pat_msr_val); } static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ /* * Does intersection of PAT memory type and MTRR memory type and returns * the resulting memory type as PAT understands it. * (Type in pat and mtrr will not have same value) * The intersection is based on "Effective Memory Type" tables in IA-32 * SDM vol 3a */ static unsigned long pat_x_mtrr_type(u64 start, u64 end, enum page_cache_mode req_type) { /* * Look for MTRR hint to get the effective type in case where PAT * request is for WB. */ if (req_type == _PAGE_CACHE_MODE_WB) { u8 mtrr_type, uniform; mtrr_type = mtrr_type_lookup(start, end, &uniform); if (mtrr_type != MTRR_TYPE_WRBACK) return _PAGE_CACHE_MODE_UC_MINUS; return _PAGE_CACHE_MODE_WB; } return req_type; } struct pagerange_state { unsigned long cur_pfn; int ram; int not_ram; }; static int pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) { struct pagerange_state *state = arg; state->not_ram |= initial_pfn > state->cur_pfn; state->ram |= total_nr_pages > 0; state->cur_pfn = initial_pfn + total_nr_pages; return state->ram && state->not_ram; } static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { int ret = 0; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; struct pagerange_state state = {start_pfn, 0, 0}; /* * For legacy reasons, physical address range in the legacy ISA * region is tracked as non-RAM. This will allow users of * /dev/mem to map portions of legacy ISA region, even when * some of those portions are listed(or not even listed) with * different e820 types(RAM/reserved/..) */ if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; if (start_pfn < end_pfn) { ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &state, pagerange_is_ram_callback); } return (ret > 0) ? -1 : (state.ram ? 1 : 0); } /* * For RAM pages, we use page flags to mark the pages with appropriate type. * The page flags are limited to four types, WB (default), WC, WT and UC-. * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting * a new memory type is only allowed for a page mapped with the default WB * type. * * Here we do two passes: * - Find the memtype of all the pages in the range, look for any conflicts. * - In case of no conflicts, set the new memtype for pages in the range. */ static int reserve_ram_pages_type(u64 start, u64 end, enum page_cache_mode req_type, enum page_cache_mode *new_type) { struct page *page; u64 pfn; if (req_type == _PAGE_CACHE_MODE_WP) { if (new_type) *new_type = _PAGE_CACHE_MODE_UC_MINUS; return -EINVAL; } if (req_type == _PAGE_CACHE_MODE_UC) { /* We do not support strong UC */ WARN_ON_ONCE(1); req_type = _PAGE_CACHE_MODE_UC_MINUS; } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { enum page_cache_mode type; page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != _PAGE_CACHE_MODE_WB) { pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", start, end - 1, type, req_type); if (new_type) *new_type = type; return -EBUSY; } } if (new_type) *new_type = req_type; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); set_page_memtype(page, req_type); } return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); set_page_memtype(page, _PAGE_CACHE_MODE_WB); } return 0; } static u64 sanitize_phys(u64 address) { /* * When changing the memtype for pages containing poison allow * for a "decoy" virtual address (bit 63 clear) passed to * set_memory_X(). __pa() on a "decoy" address results in a * physical address with bit 63 set. * * Decoy addresses are not present for 32-bit builds, see * set_mce_nospec(). */ if (IS_ENABLED(CONFIG_X86_64)) return address & __PHYSICAL_MASK; return address; } /* * req_type typically has one of the: * - _PAGE_CACHE_MODE_WB * - _PAGE_CACHE_MODE_WC * - _PAGE_CACHE_MODE_UC_MINUS * - _PAGE_CACHE_MODE_UC * - _PAGE_CACHE_MODE_WT * * If new_type is NULL, function will return an error if it cannot reserve the * region with req_type. If new_type is non-NULL, function will return * available type in new_type in case of no error. In case of any error * it will return a negative return value. */ int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type, enum page_cache_mode *new_type) { struct memtype *entry_new; enum page_cache_mode actual_type; int is_range_ram; int err = 0; start = sanitize_phys(start); /* * The end address passed into this function is exclusive, but * sanitize_phys() expects an inclusive address. */ end = sanitize_phys(end - 1) + 1; if (start >= end) { WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, start, end - 1, cattr_name(req_type)); return -EINVAL; } if (!pat_enabled()) { /* This is identical to page table setting without PAT */ if (new_type) *new_type = req_type; return 0; } /* Low ISA region is always mapped WB in page table. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_MODE_WB; return 0; } /* * Call mtrr_lookup to get the type hint. This is an * optimization for /dev/mem mmap'ers into WB memory (BIOS * tools and ACPI tools). Use WB request for WB memory and use * UC_MINUS otherwise. */ actual_type = pat_x_mtrr_type(start, end, req_type); if (new_type) *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) { err = reserve_ram_pages_type(start, end, req_type, new_type); return err; } else if (is_range_ram < 0) { return -EINVAL; } entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!entry_new) return -ENOMEM; entry_new->start = start; entry_new->end = end; entry_new->type = actual_type; spin_lock(&memtype_lock); err = memtype_check_insert(entry_new, new_type); if (err) { pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n", start, end - 1, cattr_name(entry_new->type), cattr_name(req_type)); kfree(entry_new); spin_unlock(&memtype_lock); return err; } spin_unlock(&memtype_lock); dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", start, end - 1, cattr_name(entry_new->type), cattr_name(req_type), new_type ? cattr_name(*new_type) : "-"); return err; } int memtype_free(u64 start, u64 end) { int is_range_ram; struct memtype *entry_old; if (!pat_enabled()) return 0; start = sanitize_phys(start); end = sanitize_phys(end); /* Low ISA region is always mapped WB. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) return free_ram_pages_type(start, end); if (is_range_ram < 0) return -EINVAL; spin_lock(&memtype_lock); entry_old = memtype_erase(start, end); spin_unlock(&memtype_lock); if (IS_ERR(entry_old)) { pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, start, end - 1); return -EINVAL; } kfree(entry_old); dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1); return 0; } /** * lookup_memtype - Looks up the memory type for a physical address * @paddr: physical address of which memory type needs to be looked up * * Only to be called when PAT is enabled * * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS * or _PAGE_CACHE_MODE_WT. */ static enum page_cache_mode lookup_memtype(u64 paddr) { enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB; struct memtype *entry; if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; page = pfn_to_page(paddr >> PAGE_SHIFT); return get_page_memtype(page); } spin_lock(&memtype_lock); entry = memtype_lookup(paddr); if (entry != NULL) rettype = entry->type; else rettype = _PAGE_CACHE_MODE_UC_MINUS; spin_unlock(&memtype_lock); return rettype; } /** * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type * of @pfn cannot be overridden by UC MTRR memory type. * * Only to be called when PAT is enabled. * * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC. * Returns false in other cases. */ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) { enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn)); return cm == _PAGE_CACHE_MODE_UC || cm == _PAGE_CACHE_MODE_UC_MINUS || cm == _PAGE_CACHE_MODE_WC; } EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); /** * memtype_reserve_io - Request a memory type mapping for a region of memory * @start: start (physical address) of the region * @end: end (physical address) of the region * @type: A pointer to memtype, with requested type. On success, requested * or any other compatible type that was available for the region is returned * * On success, returns 0 * On failure, returns non-zero */ int memtype_reserve_io(resource_size_t start, resource_size_t end, enum page_cache_mode *type) { resource_size_t size = end - start; enum page_cache_mode req_type = *type; enum page_cache_mode new_type; int ret; WARN_ON_ONCE(iomem_map_sanity_check(start, size)); ret = memtype_reserve(start, end, req_type, &new_type); if (ret) goto out_err; if (!is_new_memtype_allowed(start, size, req_type, new_type)) goto out_free; if (memtype_kernel_map_sync(start, size, new_type) < 0) goto out_free; *type = new_type; return 0; out_free: memtype_free(start, end); ret = -EBUSY; out_err: return ret; } /** * memtype_free_io - Release a memory type mapping for a region of memory * @start: start (physical address) of the region * @end: end (physical address) of the region */ void memtype_free_io(resource_size_t start, resource_size_t end) { memtype_free(start, end); } #ifdef CONFIG_X86_PAT int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) { enum page_cache_mode type = _PAGE_CACHE_MODE_WC; return memtype_reserve_io(start, start + size, &type); } EXPORT_SYMBOL(arch_io_reserve_memtype_wc); void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) { memtype_free_io(start, start + size); } EXPORT_SYMBOL(arch_io_free_memtype_wc); #endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) vma_prot = pgprot_decrypted(vma_prot); return vma_prot; } #ifdef CONFIG_STRICT_DEVMEM /* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { return 1; } #else /* This check is needed to avoid cache aliasing when PAT is enabled */ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { u64 from = ((u64)pfn) << PAGE_SHIFT; u64 to = from + size; u64 cursor = from; if (!pat_enabled()) return 1; while (cursor < to) { if (!devmem_is_allowed(pfn)) return 0; cursor += PAGE_SIZE; pfn++; } return 1; } #endif /* CONFIG_STRICT_DEVMEM */ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; if (!range_is_allowed(pfn, size)) return 0; if (file->f_flags & O_DSYNC) pcm = _PAGE_CACHE_MODE_UC_MINUS; *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | cachemode2protval(pcm)); return 1; } /* * Change the memory type for the physical address range in kernel identity * mapping space if that range is a part of identity map. */ int memtype_kernel_map_sync(u64 base, unsigned long size, enum page_cache_mode pcm) { unsigned long id_sz; if (base > __pa(high_memory-1)) return 0; /* * Some areas in the middle of the kernel identity range * are not mapped, for example the PCI space. */ if (!page_is_ram(base >> PAGE_SHIFT)) return 0; id_sz = (__pa(high_memory-1) <= base + size) ? __pa(high_memory) - base : size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(pcm), base, (unsigned long long)(base + size-1)); return -EINVAL; } return 0; } /* * Internal interface to reserve a range of physical memory with prot. * Reserved non RAM regions only and after successful memtype_reserve, * this func also keeps identity mapping (if any) in sync with this new prot. */ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, int strict_prot) { int is_ram = 0; int ret; enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot); enum page_cache_mode pcm = want_pcm; is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* * reserve_pfn_range() for RAM pages. We do not refcount to keep * track of number of mappings of RAM pages. We can assert that * the type requested matches the type of first page in the range. */ if (is_ram) { if (!pat_enabled()) return 0; pcm = lookup_memtype(paddr); if (want_pcm != pcm) { pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), cattr_name(pcm)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); } return 0; } ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm); if (ret) return ret; if (pcm != want_pcm) { if (strict_prot || !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { memtype_free(paddr, paddr + size); pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_pcm), (unsigned long long)paddr, (unsigned long long)(paddr + size - 1), cattr_name(pcm)); return -EINVAL; } /* * We allow returning different type than the one requested in * non strict case. */ *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); } if (memtype_kernel_map_sync(paddr, size, pcm) < 0) { memtype_free(paddr, paddr + size); return -EINVAL; } return 0; } /* * Internal interface to free a range of physical memory. * Frees non RAM regions only. */ static void free_pfn_range(u64 paddr, unsigned long size) { int is_ram; is_ram = pat_pagerange_is_ram(paddr, paddr + size); if (is_ram == 0) memtype_free(paddr, paddr + size); } static int follow_phys(struct vm_area_struct *vma, unsigned long *prot, resource_size_t *phys) { struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start }; if (follow_pfnmap_start(&args)) return -EINVAL; /* Never return PFNs of anon folios in COW mappings. */ if (!args.special) { follow_pfnmap_end(&args); return -EINVAL; } *prot = pgprot_val(args.pgprot); *phys = (resource_size_t)args.pfn << PAGE_SHIFT; follow_pfnmap_end(&args); return 0; } static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, pgprot_t *pgprot) { unsigned long prot; VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT)); /* * We need the starting PFN and cachemode used for track_pfn_remap() * that covered the whole VMA. For most mappings, we can obtain that * information from the page tables. For COW mappings, we might now * suddenly have anon folios mapped and follow_phys() will fail. * * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to * detect the PFN. If we need the cachemode as well, we're out of luck * for now and have to fail fork(). */ if (!follow_phys(vma, &prot, paddr)) { if (pgprot) *pgprot = __pgprot(prot); return 0; } if (is_cow_mapping(vma->vm_flags)) { if (pgprot) return -EINVAL; *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return 0; } WARN_ON_ONCE(1); return -EINVAL; } /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). * * If the vma has a linear pfn mapping for the entire range, we get the prot * from pte and reserve the entire vma range with single reserve_pfn_range call. */ int track_pfn_copy(struct vm_area_struct *vma) { resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; if (vma->vm_flags & VM_PAT) { if (get_pat_info(vma, &paddr, &pgprot)) return -EINVAL; /* reserve the whole chunk covered by vma. */ return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } return 0; } /* * prot is passed in as a parameter for the new mapping. If the vma has * a linear pfn mapping for the entire range, or no vma is provided, * reserve the entire pfn + size range with single reserve_pfn_range * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ if (!vma || (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); if (ret == 0 && vma) vm_flags_set(vma, VM_PAT); return ret; } if (!pat_enabled()) return 0; /* * For anything smaller than the vma size we set prot based on the * lookup. */ pcm = lookup_memtype(paddr); /* Check memtype for the remaining pages */ while (size > PAGE_SIZE) { size -= PAGE_SIZE; paddr += PAGE_SIZE; if (pcm != lookup_memtype(paddr)) return -EINVAL; } *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; } void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { enum page_cache_mode pcm; if (!pat_enabled()) return; /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); } /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked) { resource_size_t paddr; if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ paddr = (resource_size_t)pfn << PAGE_SHIFT; if (!paddr && !size) { if (get_pat_info(vma, &paddr, NULL)) return; size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); if (vma) { if (mm_wr_locked) vm_flags_clear(vma, VM_PAT); else __vm_flags_mod(vma, 0, VM_PAT); } } /* * untrack_pfn_clear is called if the following situation fits: * * 1) while mremapping a pfnmap for a new region, with the old vma after * its pfnmap page table has been removed. The new vma has a new pfnmap * to the same pfn & cache type with VM_PAT set. * 2) while duplicating vm area, the new vma fails to copy the pgtable from * old vma. */ void untrack_pfn_clear(struct vm_area_struct *vma) { vm_flags_clear(vma, VM_PAT); } pgprot_t pgprot_writecombine(pgprot_t prot) { return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WC)); } EXPORT_SYMBOL_GPL(pgprot_writecombine); pgprot_t pgprot_writethrough(pgprot_t prot) { return __pgprot(pgprot_val(prot) | cachemode2protval(_PAGE_CACHE_MODE_WT)); } EXPORT_SYMBOL_GPL(pgprot_writethrough); #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* * We are allocating a temporary printout-entry to be passed * between seq_start()/next() and seq_show(): */ static struct memtype *memtype_get_idx(loff_t pos) { struct memtype *entry_print; int ret; entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL); if (!entry_print) return NULL; spin_lock(&memtype_lock); ret = memtype_copy_nth_element(entry_print, pos); spin_unlock(&memtype_lock); /* Free it on error: */ if (ret) { kfree(entry_print); return NULL; } return entry_print; } static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos == 0) { ++*pos; seq_puts(seq, "PAT memtype list:\n"); } return memtype_get_idx(*pos); } static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { kfree(v); ++*pos; return memtype_get_idx(*pos); } static void memtype_seq_stop(struct seq_file *seq, void *v) { kfree(v); } static int memtype_seq_show(struct seq_file *seq, void *v) { struct memtype *entry_print = (struct memtype *)v; seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n", entry_print->start, entry_print->end, cattr_name(entry_print->type)); return 0; } static const struct seq_operations memtype_seq_ops = { .start = memtype_seq_start, .next = memtype_seq_next, .stop = memtype_seq_stop, .show = memtype_seq_show, }; static int memtype_seq_open(struct inode *inode, struct file *file) { return seq_open(file, &memtype_seq_ops); } static const struct file_operations memtype_fops = { .open = memtype_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static int __init pat_memtype_list_init(void) { if (pat_enabled()) { debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, NULL, &memtype_fops); } return 0; } late_initcall(pat_memtype_list_init); #endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 396 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 // SPDX-License-Identifier: GPL-2.0-or-later /* * tsacct.c - System accounting over taskstats interface * * Copyright (C) Jay Lan, <jlan@sgi.com> */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/jiffies.h> #include <linux/mm.h> /* * fill in basic accounting fields */ void bacct_add_tsk(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct taskstats *stats, struct task_struct *tsk) { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ now_ns = ktime_get_ns(); /* store whole group time first */ delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); stats->ac_tgetime = delta; delta = now_ns - tsk->start_time; do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); stats->ac_btime64 = btime; if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) stats->ac_flag |= ACORE; if (tsk->flags & PF_SIGNALED) stats->ac_flag |= AXSIG; stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); stats->ac_ppid = pid_alive(tsk) ? task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); task_cputime(tsk, &utime, &stime); stats->ac_utime = div_u64(utime, NSEC_PER_USEC); stats->ac_stime = div_u64(stime, NSEC_PER_USEC); task_cputime_scaled(tsk, &utimescaled, &stimescaled); stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC); stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; strscpy_pad(stats->ac_comm, tsk->comm); } #ifdef CONFIG_TASK_XACCT #define KB 1024 #define MB (1024*KB) #define KB_MASK (~(KB-1)) /* * fill in extended accounting fields */ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; do_div(stats->coremem, 1000 * KB); stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar & KB_MASK; stats->write_char = p->ioac.wchar & KB_MASK; stats->read_syscalls = p->ioac.syscr & KB_MASK; stats->write_syscalls = p->ioac.syscw & KB_MASK; #ifdef CONFIG_TASK_IO_ACCOUNTING stats->read_bytes = p->ioac.read_bytes & KB_MASK; stats->write_bytes = p->ioac.write_bytes & KB_MASK; stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK; #else stats->read_bytes = 0; stats->write_bytes = 0; stats->cancelled_write_bytes = 0; #endif } #undef KB #undef MB static void __acct_update_integrals(struct task_struct *tsk, u64 utime, u64 stime) { u64 time, delta; if (!likely(tsk->mm)) return; time = stime + utime; delta = time - tsk->acct_timexpd; if (delta < TICK_NSEC) return; tsk->acct_timexpd = time; /* * Divide by 1024 to avoid overflow, and to avoid division. * The final unit reported to userspace is Mbyte-usecs, * the rest of the math is done in xacct_add_tsk. */ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10; } /** * acct_update_integrals - update mm integral fields in task_struct * @tsk: task_struct for accounting */ void acct_update_integrals(struct task_struct *tsk) { u64 utime, stime; unsigned long flags; local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); local_irq_restore(flags); } /** * acct_account_cputime - update mm integral after cputime update * @tsk: task_struct for accounting */ void acct_account_cputime(struct task_struct *tsk) { __acct_update_integrals(tsk, tsk->utime, tsk->stime); } /** * acct_clear_integrals - clear the mm integral fields in task_struct * @tsk: task_struct whose accounting fields are cleared */ void acct_clear_integrals(struct task_struct *tsk) { tsk->acct_timexpd = 0; tsk->acct_rss_mem1 = 0; tsk->acct_vm_mem1 = 0; } #endif
1 2 2 1 1 1 1 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2013 Andrew Duggan <aduggan@synaptics.com> * Copyright (c) 2013 Synaptics Incorporated * Copyright (c) 2014 Benjamin Tissoires <benjamin.tissoires@gmail.com> * Copyright (c) 2014 Red Hat, Inc */ #include <linux/kernel.h> #include <linux/hid.h> #include <linux/input.h> #include <linux/input/mt.h> #include <linux/irq.h> #include <linux/irqdomain.h> #include <linux/module.h> #include <linux/pm.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/sched.h> #include <linux/rmi.h> #include "hid-ids.h" #define RMI_MOUSE_REPORT_ID 0x01 /* Mouse emulation Report */ #define RMI_WRITE_REPORT_ID 0x09 /* Output Report */ #define RMI_READ_ADDR_REPORT_ID 0x0a /* Output Report */ #define RMI_READ_DATA_REPORT_ID 0x0b /* Input Report */ #define RMI_ATTN_REPORT_ID 0x0c /* Input Report */ #define RMI_SET_RMI_MODE_REPORT_ID 0x0f /* Feature Report */ /* flags */ #define RMI_READ_REQUEST_PENDING 0 #define RMI_READ_DATA_PENDING 1 #define RMI_STARTED 2 /* device flags */ #define RMI_DEVICE BIT(0) #define RMI_DEVICE_HAS_PHYS_BUTTONS BIT(1) #define RMI_DEVICE_OUTPUT_SET_REPORT BIT(2) /* * retrieve the ctrl registers * the ctrl register has a size of 20 but a fw bug split it into 16 + 4, * and there is no way to know if the first 20 bytes are here or not. * We use only the first 12 bytes, so get only them. */ #define RMI_F11_CTRL_REG_COUNT 12 enum rmi_mode_type { RMI_MODE_OFF = 0, RMI_MODE_ATTN_REPORTS = 1, RMI_MODE_NO_PACKED_ATTN_REPORTS = 2, }; /** * struct rmi_data - stores information for hid communication * * @page_mutex: Locks current page to avoid changing pages in unexpected ways. * @page: Keeps track of the current virtual page * @xport: transport device to be registered with the RMI4 core. * * @wait: Used for waiting for read data * * @writeReport: output buffer when writing RMI registers * @readReport: input buffer when reading RMI registers * * @input_report_size: size of an input report (advertised by HID) * @output_report_size: size of an output report (advertised by HID) * * @flags: flags for the current device (started, reading, etc...) * * @reset_work: worker which will be called in case of a mouse report * @hdev: pointer to the struct hid_device * * @device_flags: flags which describe the device * * @domain: the IRQ domain allocated for this RMI4 device * @rmi_irq: the irq that will be used to generate events to rmi-core */ struct rmi_data { struct mutex page_mutex; int page; struct rmi_transport_dev xport; wait_queue_head_t wait; u8 *writeReport; u8 *readReport; u32 input_report_size; u32 output_report_size; unsigned long flags; struct work_struct reset_work; struct hid_device *hdev; unsigned long device_flags; struct irq_domain *domain; int rmi_irq; }; #define RMI_PAGE(addr) (((addr) >> 8) & 0xff) static int rmi_write_report(struct hid_device *hdev, u8 *report, int len); /** * rmi_set_page - Set RMI page * @hdev: The pointer to the hid_device struct * @page: The new page address. * * RMI devices have 16-bit addressing, but some of the physical * implementations (like SMBus) only have 8-bit addressing. So RMI implements * a page address at 0xff of every page so we can reliable page addresses * every 256 registers. * * The page_mutex lock must be held when this function is entered. * * Returns zero on success, non-zero on failure. */ static int rmi_set_page(struct hid_device *hdev, u8 page) { struct rmi_data *data = hid_get_drvdata(hdev); int retval; data->writeReport[0] = RMI_WRITE_REPORT_ID; data->writeReport[1] = 1; data->writeReport[2] = 0xFF; data->writeReport[4] = page; retval = rmi_write_report(hdev, data->writeReport, data->output_report_size); if (retval != data->output_report_size) { dev_err(&hdev->dev, "%s: set page failed: %d.", __func__, retval); return retval; } data->page = page; return 0; } static int rmi_set_mode(struct hid_device *hdev, u8 mode) { int ret; const u8 txbuf[2] = {RMI_SET_RMI_MODE_REPORT_ID, mode}; u8 *buf; buf = kmemdup(txbuf, sizeof(txbuf), GFP_KERNEL); if (!buf) return -ENOMEM; ret = hid_hw_raw_request(hdev, RMI_SET_RMI_MODE_REPORT_ID, buf, sizeof(txbuf), HID_FEATURE_REPORT, HID_REQ_SET_REPORT); kfree(buf); if (ret < 0) { dev_err(&hdev->dev, "unable to set rmi mode to %d (%d)\n", mode, ret); return ret; } return 0; } static int rmi_write_report(struct hid_device *hdev, u8 *report, int len) { struct rmi_data *data = hid_get_drvdata(hdev); int ret; if (data->device_flags & RMI_DEVICE_OUTPUT_SET_REPORT) { /* * Talk to device by using SET_REPORT requests instead. */ ret = hid_hw_raw_request(hdev, report[0], report, len, HID_OUTPUT_REPORT, HID_REQ_SET_REPORT); } else { ret = hid_hw_output_report(hdev, (void *)report, len); } if (ret < 0) { dev_err(&hdev->dev, "failed to write hid report (%d)\n", ret); return ret; } return ret; } static int rmi_hid_read_block(struct rmi_transport_dev *xport, u16 addr, void *buf, size_t len) { struct rmi_data *data = container_of(xport, struct rmi_data, xport); struct hid_device *hdev = data->hdev; int ret; int bytes_read; int bytes_needed; int retries; int read_input_count; mutex_lock(&data->page_mutex); if (RMI_PAGE(addr) != data->page) { ret = rmi_set_page(hdev, RMI_PAGE(addr)); if (ret < 0) goto exit; } for (retries = 5; retries > 0; retries--) { data->writeReport[0] = RMI_READ_ADDR_REPORT_ID; data->writeReport[1] = 0; /* old 1 byte read count */ data->writeReport[2] = addr & 0xFF; data->writeReport[3] = (addr >> 8) & 0xFF; data->writeReport[4] = len & 0xFF; data->writeReport[5] = (len >> 8) & 0xFF; set_bit(RMI_READ_REQUEST_PENDING, &data->flags); ret = rmi_write_report(hdev, data->writeReport, data->output_report_size); if (ret != data->output_report_size) { dev_err(&hdev->dev, "failed to write request output report (%d)\n", ret); goto exit; } bytes_read = 0; bytes_needed = len; while (bytes_read < len) { if (!wait_event_timeout(data->wait, test_bit(RMI_READ_DATA_PENDING, &data->flags), msecs_to_jiffies(1000))) { hid_warn(hdev, "%s: timeout elapsed\n", __func__); ret = -EAGAIN; break; } read_input_count = data->readReport[1]; memcpy(buf + bytes_read, &data->readReport[2], min(read_input_count, bytes_needed)); bytes_read += read_input_count; bytes_needed -= read_input_count; clear_bit(RMI_READ_DATA_PENDING, &data->flags); } if (ret >= 0) { ret = 0; break; } } exit: clear_bit(RMI_READ_REQUEST_PENDING, &data->flags); mutex_unlock(&data->page_mutex); return ret; } static int rmi_hid_write_block(struct rmi_transport_dev *xport, u16 addr, const void *buf, size_t len) { struct rmi_data *data = container_of(xport, struct rmi_data, xport); struct hid_device *hdev = data->hdev; int ret; mutex_lock(&data->page_mutex); if (RMI_PAGE(addr) != data->page) { ret = rmi_set_page(hdev, RMI_PAGE(addr)); if (ret < 0) goto exit; } data->writeReport[0] = RMI_WRITE_REPORT_ID; data->writeReport[1] = len; data->writeReport[2] = addr & 0xFF; data->writeReport[3] = (addr >> 8) & 0xFF; memcpy(&data->writeReport[4], buf, len); ret = rmi_write_report(hdev, data->writeReport, data->output_report_size); if (ret < 0) { dev_err(&hdev->dev, "failed to write request output report (%d)\n", ret); goto exit; } ret = 0; exit: mutex_unlock(&data->page_mutex); return ret; } static int rmi_reset_attn_mode(struct hid_device *hdev) { struct rmi_data *data = hid_get_drvdata(hdev); struct rmi_device *rmi_dev = data->xport.rmi_dev; int ret; ret = rmi_set_mode(hdev, RMI_MODE_ATTN_REPORTS); if (ret) return ret; if (test_bit(RMI_STARTED, &data->flags)) ret = rmi_dev->driver->reset_handler(rmi_dev); return ret; } static void rmi_reset_work(struct work_struct *work) { struct rmi_data *hdata = container_of(work, struct rmi_data, reset_work); /* switch the device to RMI if we receive a generic mouse report */ rmi_reset_attn_mode(hdata->hdev); } static int rmi_input_event(struct hid_device *hdev, u8 *data, int size) { struct rmi_data *hdata = hid_get_drvdata(hdev); struct rmi_device *rmi_dev = hdata->xport.rmi_dev; unsigned long flags; if (!(test_bit(RMI_STARTED, &hdata->flags))) return 0; pm_wakeup_event(hdev->dev.parent, 0); local_irq_save(flags); rmi_set_attn_data(rmi_dev, data[1], &data[2], size - 2); generic_handle_irq(hdata->rmi_irq); local_irq_restore(flags); return 1; } static int rmi_read_data_event(struct hid_device *hdev, u8 *data, int size) { struct rmi_data *hdata = hid_get_drvdata(hdev); if (!test_bit(RMI_READ_REQUEST_PENDING, &hdata->flags)) { hid_dbg(hdev, "no read request pending\n"); return 0; } memcpy(hdata->readReport, data, min((u32)size, hdata->input_report_size)); set_bit(RMI_READ_DATA_PENDING, &hdata->flags); wake_up(&hdata->wait); return 1; } static int rmi_check_sanity(struct hid_device *hdev, u8 *data, int size) { int valid_size = size; /* * On the Dell XPS 13 9333, the bus sometimes get confused and fills * the report with a sentinel value "ff". Synaptics told us that such * behavior does not comes from the touchpad itself, so we filter out * such reports here. */ while ((data[valid_size - 1] == 0xff) && valid_size > 0) valid_size--; return valid_size; } static int rmi_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct rmi_data *hdata = hid_get_drvdata(hdev); if (!(hdata->device_flags & RMI_DEVICE)) return 0; size = rmi_check_sanity(hdev, data, size); if (size < 2) return 0; switch (data[0]) { case RMI_READ_DATA_REPORT_ID: return rmi_read_data_event(hdev, data, size); case RMI_ATTN_REPORT_ID: return rmi_input_event(hdev, data, size); default: return 1; } return 0; } static int rmi_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct rmi_data *data = hid_get_drvdata(hdev); if ((data->device_flags & RMI_DEVICE) && (field->application == HID_GD_POINTER || field->application == HID_GD_MOUSE)) { if (data->device_flags & RMI_DEVICE_HAS_PHYS_BUTTONS) { if ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON) return 0; if ((usage->hid == HID_GD_X || usage->hid == HID_GD_Y) && !value) return 1; } schedule_work(&data->reset_work); return 1; } return 0; } static void rmi_report(struct hid_device *hid, struct hid_report *report) { struct hid_field *field = report->field[0]; if (!(hid->claimed & HID_CLAIMED_INPUT)) return; switch (report->id) { case RMI_READ_DATA_REPORT_ID: case RMI_ATTN_REPORT_ID: return; } if (field && field->hidinput && field->hidinput->input) input_sync(field->hidinput->input); } static int rmi_suspend(struct hid_device *hdev, pm_message_t message) { struct rmi_data *data = hid_get_drvdata(hdev); struct rmi_device *rmi_dev = data->xport.rmi_dev; int ret; if (!(data->device_flags & RMI_DEVICE)) return 0; ret = rmi_driver_suspend(rmi_dev, false); if (ret) { hid_warn(hdev, "Failed to suspend device: %d\n", ret); return ret; } return 0; } static int rmi_post_resume(struct hid_device *hdev) { struct rmi_data *data = hid_get_drvdata(hdev); struct rmi_device *rmi_dev = data->xport.rmi_dev; int ret; if (!(data->device_flags & RMI_DEVICE)) return 0; /* Make sure the HID device is ready to receive events */ ret = hid_hw_open(hdev); if (ret) return ret; ret = rmi_reset_attn_mode(hdev); if (ret) goto out; ret = rmi_driver_resume(rmi_dev, false); if (ret) { hid_warn(hdev, "Failed to resume device: %d\n", ret); goto out; } out: hid_hw_close(hdev); return ret; } static int rmi_hid_reset(struct rmi_transport_dev *xport, u16 reset_addr) { struct rmi_data *data = container_of(xport, struct rmi_data, xport); struct hid_device *hdev = data->hdev; return rmi_reset_attn_mode(hdev); } static int rmi_input_configured(struct hid_device *hdev, struct hid_input *hi) { struct rmi_data *data = hid_get_drvdata(hdev); struct input_dev *input = hi->input; int ret = 0; if (!(data->device_flags & RMI_DEVICE)) return 0; data->xport.input = input; hid_dbg(hdev, "Opening low level driver\n"); ret = hid_hw_open(hdev); if (ret) return ret; /* Allow incoming hid reports */ hid_device_io_start(hdev); ret = rmi_set_mode(hdev, RMI_MODE_ATTN_REPORTS); if (ret < 0) { dev_err(&hdev->dev, "failed to set rmi mode\n"); goto exit; } ret = rmi_set_page(hdev, 0); if (ret < 0) { dev_err(&hdev->dev, "failed to set page select to 0.\n"); goto exit; } ret = rmi_register_transport_device(&data->xport); if (ret < 0) { dev_err(&hdev->dev, "failed to register transport driver\n"); goto exit; } set_bit(RMI_STARTED, &data->flags); exit: hid_device_io_stop(hdev); hid_hw_close(hdev); return ret; } static int rmi_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct rmi_data *data = hid_get_drvdata(hdev); /* * we want to make HID ignore the advertised HID collection * for RMI deivces */ if (data->device_flags & RMI_DEVICE) { if ((data->device_flags & RMI_DEVICE_HAS_PHYS_BUTTONS) && ((usage->hid & HID_USAGE_PAGE) == HID_UP_BUTTON)) return 0; return -1; } return 0; } static int rmi_check_valid_report_id(struct hid_device *hdev, unsigned type, unsigned id, struct hid_report **report) { int i; *report = hdev->report_enum[type].report_id_hash[id]; if (*report) { for (i = 0; i < (*report)->maxfield; i++) { unsigned app = (*report)->field[i]->application; if ((app & HID_USAGE_PAGE) >= HID_UP_MSVENDOR) return 1; } } return 0; } static struct rmi_device_platform_data rmi_hid_pdata = { .sensor_pdata = { .sensor_type = rmi_sensor_touchpad, .axis_align.flip_y = true, .dribble = RMI_REG_STATE_ON, .palm_detect = RMI_REG_STATE_OFF, }, }; static const struct rmi_transport_ops hid_rmi_ops = { .write_block = rmi_hid_write_block, .read_block = rmi_hid_read_block, .reset = rmi_hid_reset, }; static void rmi_irq_teardown(void *data) { struct rmi_data *hdata = data; struct irq_domain *domain = hdata->domain; if (!domain) return; irq_dispose_mapping(irq_find_mapping(domain, 0)); irq_domain_remove(domain); hdata->domain = NULL; hdata->rmi_irq = 0; } static int rmi_irq_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw_irq_num) { irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); return 0; } static const struct irq_domain_ops rmi_irq_ops = { .map = rmi_irq_map, }; static int rmi_setup_irq_domain(struct hid_device *hdev) { struct rmi_data *hdata = hid_get_drvdata(hdev); int ret; hdata->domain = irq_domain_create_linear(hdev->dev.fwnode, 1, &rmi_irq_ops, hdata); if (!hdata->domain) return -ENOMEM; ret = devm_add_action_or_reset(&hdev->dev, &rmi_irq_teardown, hdata); if (ret) return ret; hdata->rmi_irq = irq_create_mapping(hdata->domain, 0); if (hdata->rmi_irq <= 0) { hid_err(hdev, "Can't allocate an IRQ\n"); return hdata->rmi_irq < 0 ? hdata->rmi_irq : -ENXIO; } return 0; } static int rmi_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct rmi_data *data = NULL; int ret; size_t alloc_size; struct hid_report *input_report; struct hid_report *output_report; struct hid_report *feature_report; data = devm_kzalloc(&hdev->dev, sizeof(struct rmi_data), GFP_KERNEL); if (!data) return -ENOMEM; INIT_WORK(&data->reset_work, rmi_reset_work); data->hdev = hdev; hid_set_drvdata(hdev, data); hdev->quirks |= HID_QUIRK_NO_INIT_REPORTS; hdev->quirks |= HID_QUIRK_NO_INPUT_SYNC; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); return ret; } if (id->driver_data) data->device_flags = id->driver_data; /* * Check for the RMI specific report ids. If they are misisng * simply return and let the events be processed by hid-input */ if (!rmi_check_valid_report_id(hdev, HID_FEATURE_REPORT, RMI_SET_RMI_MODE_REPORT_ID, &feature_report)) { hid_dbg(hdev, "device does not have set mode feature report\n"); goto start; } if (!rmi_check_valid_report_id(hdev, HID_INPUT_REPORT, RMI_ATTN_REPORT_ID, &input_report)) { hid_dbg(hdev, "device does not have attention input report\n"); goto start; } data->input_report_size = hid_report_len(input_report); if (!rmi_check_valid_report_id(hdev, HID_OUTPUT_REPORT, RMI_WRITE_REPORT_ID, &output_report)) { hid_dbg(hdev, "device does not have rmi write output report\n"); goto start; } data->output_report_size = hid_report_len(output_report); data->device_flags |= RMI_DEVICE; alloc_size = data->output_report_size + data->input_report_size; data->writeReport = devm_kzalloc(&hdev->dev, alloc_size, GFP_KERNEL); if (!data->writeReport) { hid_err(hdev, "failed to allocate buffer for HID reports\n"); return -ENOMEM; } data->readReport = data->writeReport + data->output_report_size; init_waitqueue_head(&data->wait); mutex_init(&data->page_mutex); ret = rmi_setup_irq_domain(hdev); if (ret) { hid_err(hdev, "failed to allocate IRQ domain\n"); return ret; } if (data->device_flags & RMI_DEVICE_HAS_PHYS_BUTTONS) rmi_hid_pdata.gpio_data.disable = true; data->xport.dev = hdev->dev.parent; data->xport.pdata = rmi_hid_pdata; data->xport.pdata.irq = data->rmi_irq; data->xport.proto_name = "hid"; data->xport.ops = &hid_rmi_ops; start: ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); return ret; } return 0; } static void rmi_remove(struct hid_device *hdev) { struct rmi_data *hdata = hid_get_drvdata(hdev); if ((hdata->device_flags & RMI_DEVICE) && test_bit(RMI_STARTED, &hdata->flags)) { clear_bit(RMI_STARTED, &hdata->flags); cancel_work_sync(&hdata->reset_work); rmi_unregister_transport_device(&hdata->xport); } hid_hw_stop(hdev); } static const struct hid_device_id rmi_id[] = { { HID_USB_DEVICE(USB_VENDOR_ID_RAZER, USB_DEVICE_ID_RAZER_BLADE_14), .driver_data = RMI_DEVICE_HAS_PHYS_BUTTONS }, { HID_USB_DEVICE(USB_VENDOR_ID_LENOVO, USB_DEVICE_ID_LENOVO_X1_COVER) }, { HID_USB_DEVICE(USB_VENDOR_ID_PRIMAX, USB_DEVICE_ID_PRIMAX_REZEL) }, { HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5), .driver_data = RMI_DEVICE_OUTPUT_SET_REPORT }, { HID_DEVICE(HID_BUS_ANY, HID_GROUP_RMI, HID_ANY_ID, HID_ANY_ID) }, { } }; MODULE_DEVICE_TABLE(hid, rmi_id); static struct hid_driver rmi_driver = { .name = "hid-rmi", .id_table = rmi_id, .probe = rmi_probe, .remove = rmi_remove, .event = rmi_event, .raw_event = rmi_raw_event, .report = rmi_report, .input_mapping = rmi_input_mapping, .input_configured = rmi_input_configured, .suspend = pm_ptr(rmi_suspend), .resume = pm_ptr(rmi_post_resume), .reset_resume = pm_ptr(rmi_post_resume), }; module_hid_driver(rmi_driver); MODULE_AUTHOR("Andrew Duggan <aduggan@synaptics.com>"); MODULE_DESCRIPTION("RMI HID driver"); MODULE_LICENSE("GPL");
4 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_meta.h> #include <linux/if_bridge.h> #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ #include "../br_private.h" static const struct net_device * nft_meta_get_bridge(const struct net_device *dev) { if (dev && netif_is_bridge_port(dev)) return netdev_master_upper_dev_get_rcu((struct net_device *)dev); return NULL; } static void nft_meta_bridge_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); u32 *dest = &regs->data[priv->dreg]; const struct net_device *br_dev; switch (priv->key) { case NFT_META_BRI_IIFNAME: br_dev = nft_meta_get_bridge(in); break; case NFT_META_BRI_OIFNAME: br_dev = nft_meta_get_bridge(out); break; case NFT_META_BRI_IIFPVID: { u16 p_pvid; br_dev = nft_meta_get_bridge(in); if (!br_dev || !br_vlan_enabled(br_dev)) goto err; br_vlan_get_pvid_rcu(in, &p_pvid); nft_reg_store16(dest, p_pvid); return; } case NFT_META_BRI_IIFVPROTO: { u16 p_proto; br_dev = nft_meta_get_bridge(in); if (!br_dev || !br_vlan_enabled(br_dev)) goto err; br_vlan_get_proto(br_dev, &p_proto); nft_reg_store_be16(dest, htons(p_proto)); return; } default: return nft_meta_get_eval(expr, regs, pkt); } strscpy_pad((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ); return; err: regs->verdict.code = NFT_BREAK; } static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_BRI_IIFNAME: case NFT_META_BRI_OIFNAME: len = IFNAMSIZ; break; case NFT_META_BRI_IIFPVID: case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; default: return nft_meta_get_init(ctx, expr, tb); } priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } static struct nft_expr_type nft_meta_bridge_type; static const struct nft_expr_ops nft_meta_bridge_get_ops = { .type = &nft_meta_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_bridge_get_eval, .init = nft_meta_bridge_get_init, .dump = nft_meta_get_dump, .reduce = nft_meta_get_reduce, }; static void nft_meta_bridge_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); u32 *sreg = &regs->data[meta->sreg]; struct sk_buff *skb = pkt->skb; u8 value8; switch (meta->key) { case NFT_META_BRI_BROUTE: value8 = nft_reg_load8(sreg); BR_INPUT_SKB_CB(skb)->br_netfilter_broute = !!value8; break; default: nft_meta_set_eval(expr, regs, pkt); } } static int nft_meta_bridge_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); switch (priv->key) { case NFT_META_BRI_BROUTE: len = sizeof(u8); break; default: return nft_meta_set_init(ctx, expr, tb); } priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; return 0; } static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { int i; for (i = 0; i < NFT_REG32_NUM; i++) { if (!track->regs[i].selector) continue; if (track->regs[i].selector->ops != &nft_meta_bridge_get_ops) continue; __nft_reg_track_cancel(track, i); } return false; } static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; switch (priv->key) { case NFT_META_BRI_BROUTE: hooks = 1 << NF_BR_PRE_ROUTING; break; default: return nft_meta_set_validate(ctx, expr); } return nft_chain_validate_hooks(ctx->chain, hooks); } static const struct nft_expr_ops nft_meta_bridge_set_ops = { .type = &nft_meta_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), .eval = nft_meta_bridge_set_eval, .init = nft_meta_bridge_set_init, .destroy = nft_meta_set_destroy, .dump = nft_meta_set_dump, .reduce = nft_meta_bridge_set_reduce, .validate = nft_meta_bridge_set_validate, }; static const struct nft_expr_ops * nft_meta_bridge_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_META_KEY] == NULL) return ERR_PTR(-EINVAL); if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) return ERR_PTR(-EINVAL); if (tb[NFTA_META_DREG]) return &nft_meta_bridge_get_ops; if (tb[NFTA_META_SREG]) return &nft_meta_bridge_set_ops; return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_meta_bridge_type __read_mostly = { .family = NFPROTO_BRIDGE, .name = "meta", .select_ops = nft_meta_bridge_select_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; static int __init nft_meta_bridge_module_init(void) { return nft_register_expr(&nft_meta_bridge_type); } static void __exit nft_meta_bridge_module_exit(void) { nft_unregister_expr(&nft_meta_bridge_type); } module_init(nft_meta_bridge_module_init); module_exit(nft_meta_bridge_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>"); MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); MODULE_DESCRIPTION("Support for bridge dedicated meta key");
8 8 8 8 5 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 5 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 // SPDX-License-Identifier: GPL-2.0-only /* * handle transition of Linux booting another kernel * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) "kexec: " fmt #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> #include <linux/gfp.h> #include <linux/reboot.h> #include <linux/numa.h> #include <linux/ftrace.h> #include <linux/io.h> #include <linux/suspend.h> #include <linux/vmalloc.h> #include <linux/efi.h> #include <linux/cc_platform.h> #include <asm/init.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/io_apic.h> #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/cpu.h> #include <asm/efi.h> #ifdef CONFIG_ACPI /* * Used while adding mapping for ACPI tables. * Can be reused when other iomem regions need be mapped */ struct init_pgtable_data { struct x86_mapping_info *info; pgd_t *level4p; }; static int mem_region_callback(struct resource *res, void *arg) { struct init_pgtable_data *data = arg; return kernel_ident_mapping_init(data->info, data->level4p, res->start, res->end + 1); } static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { struct init_pgtable_data data; unsigned long flags; int ret; data.info = info; data.level4p = level4p; flags = IORESOURCE_MEM | IORESOURCE_BUSY; ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &data, mem_region_callback); if (ret && ret != -EINVAL) return ret; /* ACPI tables could be located in ACPI Non-volatile Storage region */ ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &data, mem_region_callback); if (ret && ret != -EINVAL) return ret; return 0; } #else static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } #endif #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, NULL }; #endif static int map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) { #ifdef CONFIG_EFI unsigned long mstart, mend; void *kaddr; int ret; if (!efi_enabled(EFI_BOOT)) return 0; mstart = (boot_params.efi_info.efi_systab | ((u64)boot_params.efi_info.efi_systab_hi<<32)); if (efi_enabled(EFI_64BIT)) mend = mstart + sizeof(efi_system_table_64_t); else mend = mstart + sizeof(efi_system_table_32_t); if (!mstart) return 0; ret = kernel_ident_mapping_init(info, level4p, mstart, mend); if (ret) return ret; kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB); if (!kaddr) { pr_err("Could not map UEFI system table\n"); return -ENOMEM; } mstart = efi_config_table; if (efi_enabled(EFI_64BIT)) { efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr; mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables; } else { efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr; mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables; } memunmap(kaddr); return kernel_ident_mapping_init(info, level4p, mstart, mend); #endif return 0; } static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.p4d); image->arch.p4d = NULL; free_page((unsigned long)image->arch.pud); image->arch.pud = NULL; free_page((unsigned long)image->arch.pmd); image->arch.pmd = NULL; free_page((unsigned long)image->arch.pte); image->arch.pte = NULL; } static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; unsigned long vaddr, paddr; int result = -ENOMEM; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; vaddr = (unsigned long)relocate_kernel; paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) goto err; image->arch.p4d = p4d; set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); } p4d = p4d_offset(pgd, vaddr); if (!p4d_present(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) goto err; image->arch.pud = pud; set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pud = pud_offset(p4d, vaddr); if (!pud_present(*pud)) { pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) goto err; image->arch.pmd = pmd; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } pmd = pmd_offset(pud, vaddr); if (!pmd_present(*pmd)) { pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) goto err; image->arch.pte = pte; set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) prot = PAGE_KERNEL_EXEC; set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); return 0; err: return result; } static void *alloc_pgt_page(void *data) { struct kimage *image = (struct kimage *)data; struct page *page; void *p = NULL; page = kimage_alloc_control_pages(image, 0); if (page) { p = page_address(page); clear_page(p); } return p; } static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; int result; int i; level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; info.kernpg_flag |= _PAGE_ENC; } if (direct_gbpages) info.direct_gbpages = true; for (i = 0; i < nr_pfn_mapped; i++) { mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; result = kernel_ident_mapping_init(&info, level4p, mstart, mend); if (result) return result; } /* * segments's mem ranges could be outside 0 ~ max_pfn, * for example when jump back to original kernel from kexeced kernel. * or first kernel is booted with user mem map, and second kernel * could be loaded out of that range. */ for (i = 0; i < image->nr_segments; i++) { mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; result = kernel_ident_mapping_init(&info, level4p, mstart, mend); if (result) return result; } /* * Prepare EFI systab and ACPI tables for kexec kernel since they are * not covered by pfn_mapped. */ result = map_efi_systab(&info, level4p); if (result) return result; result = map_acpi_tables(&info, level4p); if (result) return result; return init_transition_pgtable(image, level4p); } static void load_segments(void) { __asm__ __volatile__ ( "\tmovl %0,%%ds\n" "\tmovl %0,%%es\n" "\tmovl %0,%%ss\n" "\tmovl %0,%%fs\n" "\tmovl %0,%%gs\n" : : "a" (__KERNEL_DS) : "memory" ); } int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; int result; /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); if (result) return result; return 0; } void machine_kexec_cleanup(struct kimage *image) { free_transition_pgtable(image); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; unsigned int host_mem_enc_active; int save_ftrace_enabled; void *control_page; /* * This must be done before load_segments() since if call depth tracking * is used then GS must be valid to make any function calls. */ host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) save_processor_state(); #endif save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); hw_breakpoint_disable(); cet_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC /* * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump * paths already have calls to restore_boot_irq_mode() * in one form or other. kexec jump path also need one. */ clear_IO_APIC(); restore_boot_irq_mode(); #endif } control_page = page_address(image->control_code_page) + PAGE_SIZE; __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the * descriptor table in memory accessed. * * I take advantage of this here by force loading the * segments, before I zap the gdt with an invalid value. */ load_segments(); /* * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ native_idt_invalidate(); native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, image->preserve_context, host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) restore_processor_state(); #endif __ftrace_enabled_restore(save_ftrace_enabled); } /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE /* * Apply purgatory relocations. * * @pi: Purgatory to be relocated. * @section: Section relocations applying to. * @relsec: Section containing RELAs. * @symtabsec: Corresponding symtab. * * TODO: Some of the code belongs to generic code. Move that in kexec.c. */ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, const Elf_Shdr *symtabsec) { unsigned int i; Elf64_Rela *rel; Elf64_Sym *sym; void *location; unsigned long address, sec_base, value; const char *strtab, *name, *shstrtab; const Elf_Shdr *sechdrs; /* String & section header string table */ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; rel = (void *)pi->ehdr + relsec->sh_offset; pr_debug("Applying relocate section %s to %u\n", shstrtab + relsec->sh_name, relsec->sh_info); for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { /* * rel[i].r_offset contains byte offset from beginning * of section to the storage unit affected. * * This is location to update. This is temporary buffer * where section is currently loaded. This will finally be * loaded to a different address later, pointed to by * ->sh_addr. kexec takes care of moving it * (kexec_load_segment()). */ location = pi->purgatory_buf; location += section->sh_offset; location += rel[i].r_offset; /* Final address of the location */ address = section->sh_addr + rel[i].r_offset; /* * rel[i].r_info contains information about symbol table index * w.r.t which relocation must be made and type of relocation * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get * these respectively. */ sym = (void *)pi->ehdr + symtabsec->sh_offset; sym += ELF64_R_SYM(rel[i].r_info); if (sym->st_name) name = strtab + sym->st_name; else name = shstrtab + sechdrs[sym->st_shndx].sh_name; pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", name, sym->st_info, sym->st_shndx, sym->st_value, sym->st_size); if (sym->st_shndx == SHN_UNDEF) { pr_err("Undefined symbol: %s\n", name); return -ENOEXEC; } if (sym->st_shndx == SHN_COMMON) { pr_err("symbol '%s' in common section\n", name); return -ENOEXEC; } if (sym->st_shndx == SHN_ABS) sec_base = 0; else if (sym->st_shndx >= pi->ehdr->e_shnum) { pr_err("Invalid section %d for symbol %s\n", sym->st_shndx, name); return -ENOEXEC; } else sec_base = pi->sechdrs[sym->st_shndx].sh_addr; value = sym->st_value; value += sec_base; value += rel[i].r_addend; switch (ELF64_R_TYPE(rel[i].r_info)) { case R_X86_64_NONE: break; case R_X86_64_64: *(u64 *)location = value; break; case R_X86_64_32: *(u32 *)location = value; if (value != *(u32 *)location) goto overflow; break; case R_X86_64_32S: *(s32 *)location = value; if ((s64)value != *(s32 *)location) goto overflow; break; case R_X86_64_PC32: case R_X86_64_PLT32: value -= (u64)address; *(u32 *)location = value; break; default: pr_err("Unknown rela relocation: %llu\n", ELF64_R_TYPE(rel[i].r_info)); return -ENOEXEC; } } return 0; overflow: pr_err("Overflow in relocation type %d value 0x%lx\n", (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } int arch_kimage_file_post_load_cleanup(struct kimage *image) { vfree(image->elf_headers); image->elf_headers = NULL; image->elf_headers_sz = 0; return kexec_image_post_load_cleanup_default(image); } #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_CRASH_DUMP static int kexec_mark_range(unsigned long start, unsigned long end, bool protect) { struct page *page; unsigned int nr_pages; /* * For physical range: [start, end]. We must skip the unassigned * crashk resource with zero-valued "end" member. */ if (!end || start > end) return 0; page = pfn_to_page(start >> PAGE_SHIFT); nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; if (protect) return set_pages_ro(page, nr_pages); else return set_pages_rw(page, nr_pages); } static void kexec_mark_crashkres(bool protect) { unsigned long control; kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect); /* Don't touch the control code page used in crash_kexec().*/ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); /* Control code page is located in the 2nd page. */ kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); control += KEXEC_CONTROL_PAGE_SIZE; kexec_mark_range(control, crashk_res.end, protect); } void arch_kexec_protect_crashkres(void) { kexec_mark_crashkres(true); } void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } #endif /* * During a traditional boot under SME, SME will encrypt the kernel, * so the SME kexec kernel also needs to be un-encrypted in order to * replicate a normal SME boot. * * During a traditional boot under SEV, the kernel has already been * loaded encrypted, so the SEV kexec kernel needs to be encrypted in * order to replicate a normal SEV boot. */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return 0; /* * If host memory encryption is active we need to be sure that kexec * pages are not encrypted because when we boot to the new kernel the * pages won't be accessed encrypted (initially). */ return set_memory_decrypted((unsigned long)vaddr, pages); } void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; /* * If host memory encryption is active we need to reset the pages back * to being an encrypted mapping before freeing them. */ set_memory_encrypted((unsigned long)vaddr, pages); }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 7 7 7 7 7 7 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" /** * struct devlink_resource - devlink resource * @name: name of the resource * @id: id, per devlink instance * @size: size of the resource * @size_new: updated size of the resource, reload is needed * @size_valid: valid in case the total size of the resource is valid * including its children * @parent: parent resource * @size_params: size parameters * @list: parent list * @resource_list: list of child resources * @occ_get: occupancy getter callback * @occ_get_priv: occupancy getter callback priv */ struct devlink_resource { const char *name; u64 id; u64 size; u64 size_new; bool size_valid; struct devlink_resource *parent; struct devlink_resource_size_params size_params; struct list_head list; struct list_head resource_list; devlink_resource_occ_get_t *occ_get; void *occ_get_priv; }; static struct devlink_resource * devlink_resource_find(struct devlink *devlink, struct devlink_resource *resource, u64 resource_id) { struct list_head *resource_list; if (resource) resource_list = &resource->resource_list; else resource_list = &devlink->resource_list; list_for_each_entry(resource, resource_list, list) { struct devlink_resource *child_resource; if (resource->id == resource_id) return resource; child_resource = devlink_resource_find(devlink, resource, resource_id); if (child_resource) return child_resource; } return NULL; } static void devlink_resource_validate_children(struct devlink_resource *resource) { struct devlink_resource *child_resource; bool size_valid = true; u64 parts_size = 0; if (list_empty(&resource->resource_list)) goto out; list_for_each_entry(child_resource, &resource->resource_list, list) parts_size += child_resource->size_new; if (parts_size > resource->size_new) size_valid = false; out: resource->size_valid = size_valid; } static int devlink_resource_validate_size(struct devlink_resource *resource, u64 size, struct netlink_ext_ack *extack) { u64 reminder; int err = 0; if (size > resource->size_params.size_max) { NL_SET_ERR_MSG(extack, "Size larger than maximum"); err = -EINVAL; } if (size < resource->size_params.size_min) { NL_SET_ERR_MSG(extack, "Size smaller than minimum"); err = -EINVAL; } div64_u64_rem(size, resource->size_params.size_granularity, &reminder); if (reminder) { NL_SET_ERR_MSG(extack, "Wrong granularity"); err = -EINVAL; } return err; } int devlink_nl_resource_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_resource *resource; u64 resource_id; u64 size; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) return -EINVAL; resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); resource = devlink_resource_find(devlink, NULL, resource_id); if (!resource) return -EINVAL; size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); err = devlink_resource_validate_size(resource, size, info->extack); if (err) return err; resource->size_new = size; devlink_resource_validate_children(resource); if (resource->parent) devlink_resource_validate_children(resource->parent); return 0; } static int devlink_resource_size_params_put(struct devlink_resource *resource, struct sk_buff *skb) { struct devlink_resource_size_params *size_params; size_params = &resource->size_params; if (devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, size_params->size_granularity) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, size_params->size_max) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, size_params->size_min) || nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) return -EMSGSIZE; return 0; } static int devlink_resource_occ_put(struct devlink_resource *resource, struct sk_buff *skb) { if (!resource->occ_get) return 0; return devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_OCC, resource->occ_get(resource->occ_get_priv)); } static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, struct devlink_resource *resource) { struct devlink_resource *child_resource; struct nlattr *child_resource_attr; struct nlattr *resource_attr; resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE); if (!resource_attr) return -EMSGSIZE; if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size) || devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id)) goto nla_put_failure; if (resource->size != resource->size_new && devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, resource->size_new)) goto nla_put_failure; if (devlink_resource_occ_put(resource, skb)) goto nla_put_failure; if (devlink_resource_size_params_put(resource, skb)) goto nla_put_failure; if (list_empty(&resource->resource_list)) goto out; if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, resource->size_valid)) goto nla_put_failure; child_resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE_LIST); if (!child_resource_attr) goto nla_put_failure; list_for_each_entry(child_resource, &resource->resource_list, list) { if (devlink_resource_put(devlink, skb, child_resource)) goto resource_put_failure; } nla_nest_end(skb, child_resource_attr); out: nla_nest_end(skb, resource_attr); return 0; resource_put_failure: nla_nest_cancel(skb, child_resource_attr); nla_put_failure: nla_nest_cancel(skb, resource_attr); return -EMSGSIZE; } static int devlink_resource_fill(struct genl_info *info, enum devlink_command cmd, int flags) { struct devlink *devlink = info->user_ptr[0]; struct devlink_resource *resource; struct nlattr *resources_attr; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; bool incomplete; void *hdr; int i; int err; resource = list_first_entry(&devlink->resource_list, struct devlink_resource, list); start_again: err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &devlink_nl_family, NLM_F_MULTI, cmd); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (devlink_nl_put_handle(skb, devlink)) goto nla_put_failure; resources_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE_LIST); if (!resources_attr) goto nla_put_failure; incomplete = false; i = 0; list_for_each_entry_from(resource, &devlink->resource_list, list) { err = devlink_resource_put(devlink, skb, resource); if (err) { if (!i) goto err_resource_put; incomplete = true; break; } i++; } nla_nest_end(skb, resources_attr); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = devlink_nl_msg_reply_and_new(&skb, info); if (err) return err; goto send_done; } return genlmsg_reply(skb, info); nla_put_failure: err = -EMSGSIZE; err_resource_put: nlmsg_free(skb); return err; } int devlink_nl_resource_dump_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; if (list_empty(&devlink->resource_list)) return -EOPNOTSUPP; return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); } int devlink_resources_validate(struct devlink *devlink, struct devlink_resource *resource, struct genl_info *info) { struct list_head *resource_list; int err = 0; if (resource) resource_list = &resource->resource_list; else resource_list = &devlink->resource_list; list_for_each_entry(resource, resource_list, list) { if (!resource->size_valid) return -EINVAL; err = devlink_resources_validate(devlink, resource, info); if (err) return err; } return err; } /** * devl_resource_register - devlink resource register * * @devlink: devlink * @resource_name: resource's name * @resource_size: resource's size * @resource_id: resource's id * @parent_resource_id: resource's parent id * @size_params: size parameters * * Generic resources should reuse the same names across drivers. * Please see the generic resources list at: * Documentation/networking/devlink/devlink-resource.rst */ int devl_resource_register(struct devlink *devlink, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params) { struct devlink_resource *resource; struct list_head *resource_list; bool top_hierarchy; lockdep_assert_held(&devlink->lock); top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; resource = devlink_resource_find(devlink, NULL, resource_id); if (resource) return -EEXIST; resource = kzalloc(sizeof(*resource), GFP_KERNEL); if (!resource) return -ENOMEM; if (top_hierarchy) { resource_list = &devlink->resource_list; } else { struct devlink_resource *parent_resource; parent_resource = devlink_resource_find(devlink, NULL, parent_resource_id); if (parent_resource) { resource_list = &parent_resource->resource_list; resource->parent = parent_resource; } else { kfree(resource); return -EINVAL; } } resource->name = resource_name; resource->size = resource_size; resource->size_new = resource_size; resource->id = resource_id; resource->size_valid = true; memcpy(&resource->size_params, size_params, sizeof(resource->size_params)); INIT_LIST_HEAD(&resource->resource_list); list_add_tail(&resource->list, resource_list); return 0; } EXPORT_SYMBOL_GPL(devl_resource_register); static void devlink_resource_unregister(struct devlink *devlink, struct devlink_resource *resource) { struct devlink_resource *tmp, *child_resource; list_for_each_entry_safe(child_resource, tmp, &resource->resource_list, list) { devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } } /** * devl_resources_unregister - free all resources * * @devlink: devlink */ void devl_resources_unregister(struct devlink *devlink) { struct devlink_resource *tmp, *child_resource; lockdep_assert_held(&devlink->lock); list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list, list) { devlink_resource_unregister(devlink, child_resource); list_del(&child_resource->list); kfree(child_resource); } } EXPORT_SYMBOL_GPL(devl_resources_unregister); /** * devlink_resources_unregister - free all resources * * @devlink: devlink * * Context: Takes and release devlink->lock <mutex>. */ void devlink_resources_unregister(struct devlink *devlink) { devl_lock(devlink); devl_resources_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_resources_unregister); /** * devl_resource_size_get - get and update size * * @devlink: devlink * @resource_id: the requested resource id * @p_resource_size: ptr to update */ int devl_resource_size_get(struct devlink *devlink, u64 resource_id, u64 *p_resource_size) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (!resource) return -EINVAL; *p_resource_size = resource->size_new; resource->size = resource->size_new; return 0; } EXPORT_SYMBOL_GPL(devl_resource_size_get); /** * devl_resource_occ_get_register - register occupancy getter * * @devlink: devlink * @resource_id: resource id * @occ_get: occupancy getter callback * @occ_get_priv: occupancy getter callback priv */ void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (WARN_ON(!resource)) return; WARN_ON(resource->occ_get); resource->occ_get = occ_get; resource->occ_get_priv = occ_get_priv; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); /** * devl_resource_occ_get_unregister - unregister occupancy getter * * @devlink: devlink * @resource_id: resource id */ void devl_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id) { struct devlink_resource *resource; lockdep_assert_held(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (WARN_ON(!resource)) return; WARN_ON(!resource->occ_get); resource->occ_get = NULL; resource->occ_get_priv = NULL; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister);
5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Framing Layer. * * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/crc-ccitt.h> #include <linux/netdevice.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cffrml.h> #define container_obj(layr) container_of(layr, struct cffrml, layer) struct cffrml { struct cflayer layer; bool dofcs; /* !< FCS active */ int __percpu *pcpu_refcnt; }; static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt); static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); static u32 cffrml_rcv_error; static u32 cffrml_rcv_checsum_error; struct cflayer *cffrml_create(u16 phyid, bool use_fcs) { struct cffrml *this = kzalloc(sizeof(struct cffrml), GFP_ATOMIC); if (!this) return NULL; this->pcpu_refcnt = alloc_percpu(int); if (this->pcpu_refcnt == NULL) { kfree(this); return NULL; } caif_assert(offsetof(struct cffrml, layer) == 0); this->layer.receive = cffrml_receive; this->layer.transmit = cffrml_transmit; this->layer.ctrlcmd = cffrml_ctrlcmd; snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid); this->dofcs = use_fcs; this->layer.id = phyid; return (struct cflayer *) this; } void cffrml_free(struct cflayer *layer) { struct cffrml *this = container_obj(layer); free_percpu(this->pcpu_refcnt); kfree(layer); } void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up) { this->up = up; } void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn) { this->dn = dn; } static u16 cffrml_checksum(u16 chks, void *buf, u16 len) { /* FIXME: FCS should be moved to glue in order to use OS-Specific * solutions */ return crc_ccitt(chks, buf, len); } static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) { u16 tmp; u16 len; u16 hdrchks; int pktchks; struct cffrml *this; this = container_obj(layr); cfpkt_extr_head(pkt, &tmp, 2); len = le16_to_cpu(tmp); /* Subtract for FCS on length if FCS is not used. */ if (!this->dofcs) len -= 2; if (cfpkt_setlen(pkt, len) < 0) { ++cffrml_rcv_error; pr_err("Framing length error (%d)\n", len); cfpkt_destroy(pkt); return -EPROTO; } /* * Don't do extract if FCS is false, rather do setlen - then we don't * get a cache-miss. */ if (this->dofcs) { cfpkt_extr_trail(pkt, &tmp, 2); hdrchks = le16_to_cpu(tmp); pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); if (pktchks != hdrchks) { cfpkt_add_trail(pkt, &tmp, 2); ++cffrml_rcv_error; ++cffrml_rcv_checsum_error; pr_info("Frame checksum error (0x%x != 0x%x)\n", hdrchks, pktchks); return -EILSEQ; } } if (cfpkt_erroneous(pkt)) { ++cffrml_rcv_error; pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->up == NULL) { pr_err("Layr up is missing!\n"); cfpkt_destroy(pkt); return -EINVAL; } return layr->up->receive(layr->up, pkt); } static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) { u16 chks; u16 len; __le16 data; struct cffrml *this = container_obj(layr); if (this->dofcs) { chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); data = cpu_to_le16(chks); cfpkt_add_trail(pkt, &data, 2); } else { cfpkt_pad_trail(pkt, 2); } len = cfpkt_getlen(pkt); data = cpu_to_le16(len); cfpkt_add_head(pkt, &data, 2); cfpkt_info(pkt)->hdr_len += 2; if (cfpkt_erroneous(pkt)) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->dn == NULL) { cfpkt_destroy(pkt); return -ENODEV; } return layr->dn->transmit(layr->dn, pkt); } static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); } void cffrml_put(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_dec(*this->pcpu_refcnt); } void cffrml_hold(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_inc(*this->pcpu_refcnt); } int cffrml_refcnt_read(struct cflayer *layr) { int i, refcnt = 0; struct cffrml *this = container_obj(layr); for_each_possible_cpu(i) refcnt += *per_cpu_ptr(this->pcpu_refcnt, i); return refcnt; }
309 1 306 255 5 111 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 /* SPDX-License-Identifier: GPL-2.0 */ /* * fs-verity: read-only file-based authenticity protection * * This header declares the interface between the fs/verity/ support layer and * filesystems that support fs-verity. * * Copyright 2019 Google LLC */ #ifndef _LINUX_FSVERITY_H #define _LINUX_FSVERITY_H #include <linux/fs.h> #include <linux/mm.h> #include <crypto/hash_info.h> #include <crypto/sha2.h> #include <uapi/linux/fsverity.h> /* * Largest digest size among all hash algorithms supported by fs-verity. * Currently assumed to be <= size of fsverity_descriptor::root_hash. */ #define FS_VERITY_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 /* Verity operations for filesystems */ struct fsverity_operations { /** * Begin enabling verity on the given file. * * @filp: a readonly file descriptor for the file * * The filesystem must do any needed filesystem-specific preparations * for enabling verity, e.g. evicting inline data. It also must return * -EBUSY if verity is already being enabled on the given file. * * i_rwsem is held for write. * * Return: 0 on success, -errno on failure */ int (*begin_enable_verity)(struct file *filp); /** * End enabling verity on the given file. * * @filp: a readonly file descriptor for the file * @desc: the verity descriptor to write, or NULL on failure * @desc_size: size of verity descriptor, or 0 on failure * @merkle_tree_size: total bytes the Merkle tree took up * * If desc == NULL, then enabling verity failed and the filesystem only * must do any necessary cleanups. Else, it must also store the given * verity descriptor to a fs-specific location associated with the inode * and do any fs-specific actions needed to mark the inode as a verity * inode, e.g. setting a bit in the on-disk inode. The filesystem is * also responsible for setting the S_VERITY flag in the VFS inode. * * i_rwsem is held for write, but it may have been dropped between * ->begin_enable_verity() and ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*end_enable_verity)(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size); /** * Get the verity descriptor of the given inode. * * @inode: an inode with the S_VERITY flag set * @buf: buffer in which to place the verity descriptor * @bufsize: size of @buf, or 0 to retrieve the size only * * If bufsize == 0, then the size of the verity descriptor is returned. * Otherwise the verity descriptor is written to 'buf' and its actual * size is returned; -ERANGE is returned if it's too large. This may be * called by multiple processes concurrently on the same inode. * * Return: the size on success, -errno on failure */ int (*get_verity_descriptor)(struct inode *inode, void *buf, size_t bufsize); /** * Read a Merkle tree page of the given inode. * * @inode: the inode * @index: 0-based index of the page within the Merkle tree * @num_ra_pages: The number of Merkle tree pages that should be * prefetched starting at @index if the page at @index * isn't already cached. Implementations may ignore this * argument; it's only a performance optimization. * * This can be called at any time on an open verity file. It may be * called by multiple processes concurrently, even with the same page. * * Note that this must retrieve a *page*, not necessarily a *block*. * * Return: the page on success, ERR_PTR() on failure */ struct page *(*read_merkle_tree_page)(struct inode *inode, pgoff_t index, unsigned long num_ra_pages); /** * Write a Merkle tree block to the given inode. * * @inode: the inode for which the Merkle tree is being built * @buf: the Merkle tree block to write * @pos: the position of the block in the Merkle tree (in bytes) * @size: the Merkle tree block size (in bytes) * * This is only called between ->begin_enable_verity() and * ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*write_merkle_tree_block)(struct inode *inode, const void *buf, u64 pos, unsigned int size); }; #ifdef CONFIG_FS_VERITY static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fsverity_set_info(). * I.e., another task may publish ->i_verity_info concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_verity_info); } /* enable.c */ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ int fsverity_ioctl_measure(struct file *filp, void __user *arg); int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg); /* open.c */ int __fsverity_file_open(struct inode *inode, struct file *filp); int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void __fsverity_cleanup_inode(struct inode *inode); /** * fsverity_cleanup_inode() - free the inode's verity info, if present * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free ->i_verity_info. */ static inline void fsverity_cleanup_inode(struct inode *inode) { if (inode->i_verity_info) __fsverity_cleanup_inode(inode); } /* read_metadata.c */ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); /* verify.c */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset); void fsverity_verify_bio(struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); #else /* !CONFIG_FS_VERITY */ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { return NULL; } /* enable.c */ static inline int fsverity_ioctl_enable(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } /* measure.c */ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fsverity_get_digest(struct inode *inode, u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], u8 *alg, enum hash_algo *halg) { /* * fsverity is not enabled in the kernel configuration, so always report * that the file doesn't have fsverity enabled (digest size 0). */ return 0; } /* open.c */ static inline int __fsverity_file_open(struct inode *inode, struct file *filp) { return -EOPNOTSUPP; } static inline int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline void fsverity_cleanup_inode(struct inode *inode) { } /* read_metadata.c */ static inline int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) { return -EOPNOTSUPP; } /* verify.c */ static inline bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { WARN_ON_ONCE(1); return false; } static inline void fsverity_verify_bio(struct bio *bio) { WARN_ON_ONCE(1); } static inline void fsverity_enqueue_verify_work(struct work_struct *work) { WARN_ON_ONCE(1); } #endif /* !CONFIG_FS_VERITY */ static inline bool fsverity_verify_folio(struct folio *folio) { return fsverity_verify_blocks(folio, folio_size(folio), 0); } static inline bool fsverity_verify_page(struct page *page) { return fsverity_verify_blocks(page_folio(page), PAGE_SIZE, 0); } /** * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * * This checks whether ->i_verity_info has been set. * * Filesystems call this from ->readahead() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) * * Return: true if reads need to go through fs-verity, otherwise false */ static inline bool fsverity_active(const struct inode *inode) { return fsverity_get_info(inode) != NULL; } /** * fsverity_file_open() - prepare to open a verity file * @inode: the inode being opened * @filp: the struct file being set up * * When opening a verity file, deny the open if it is for writing. Otherwise, * set up the inode's ->i_verity_info if not already done. * * When combined with fscrypt, this must be called after fscrypt_file_open(). * Otherwise, we won't have the key set up to decrypt the verity metadata. * * Return: 0 on success, -errno on failure */ static inline int fsverity_file_open(struct inode *inode, struct file *filp) { if (IS_VERITY(inode)) return __fsverity_file_open(inode, filp); return 0; } /** * fsverity_prepare_setattr() - prepare to change a verity inode's attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Verity files are immutable, so deny truncates. This isn't covered by the * open-time check because sys_truncate() takes a path, not a file descriptor. * * Return: 0 on success, -errno on failure */ static inline int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_VERITY(d_inode(dentry))) return __fsverity_prepare_setattr(dentry, attr); return 0; } #endif /* _LINUX_FSVERITY_H */
2 2 2 2 2 1 1 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/xprt.c * * This is a generic RPC call interface supporting congestion avoidance, * and asynchronous calls. * * The interface works like this: * * - When a process places a call, it allocates a request slot if * one is available. Otherwise, it sleeps on the backlog queue * (xprt_reserve). * - Next, the caller puts together the RPC message, stuffs it into * the request struct, and calls xprt_transmit(). * - xprt_transmit sends the message and installs the caller on the * transport's wait list. At the same time, if a reply is expected, * it installs a timer that is run after the packet's timeout has * expired. * - When a packet arrives, the data_ready handler walks the list of * pending requests for that transport. If a matching XID is found, the * caller is woken up, and the timer removed. * - When no reply arrives within the timeout interval, the timer is * fired by the kernel and runs xprt_timer(). It either adjusts the * timeout values (minor timeout) or wakes up the caller with a status * of -ETIMEDOUT. * - When the caller receives a notification from RPC that a reply arrived, * it should release the RPC slot, and process the reply. * If the call timed out, it may choose to retry the operation by * adjusting the initial timeout value, and simply calling rpc_call * again. * * Support for async RPC is done through a set of RPC-specific scheduling * primitives that `transparently' work for processes as well as async * tasks that rely on callbacks. * * Copyright (C) 1995-1997, Olaf Kirch <okir@monad.swb.de> * * Transport switch API copyright (C) 2005, Chuck Lever <cel@netapp.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/workqueue.h> #include <linux/net.h> #include <linux/ktime.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <trace/events/sunrpc.h> #include "sunrpc.h" #include "sysfs.h" #include "fail.h" /* * Local variables */ #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_XPRT #endif /* * Local functions */ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_destroy(struct rpc_xprt *xprt); static void xprt_request_init(struct rpc_task *task); static int xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); static unsigned long xprt_request_timeout(const struct rpc_rqst *req) { unsigned long timeout = jiffies + req->rq_timeout; if (time_before(timeout, req->rq_majortimeo)) return timeout; return req->rq_majortimeo; } /** * xprt_register_transport - register a transport implementation * @transport: transport to register * * If a transport implementation is loaded as a kernel module, it can * call this interface to make itself known to the RPC client. * * Returns: * 0: transport successfully registered * -EEXIST: transport already registered * -EINVAL: transport module being unloaded */ int xprt_register_transport(struct xprt_class *transport) { struct xprt_class *t; int result; result = -EEXIST; spin_lock(&xprt_list_lock); list_for_each_entry(t, &xprt_list, list) { /* don't register the same transport class twice */ if (t->ident == transport->ident) goto out; } list_add_tail(&transport->list, &xprt_list); printk(KERN_INFO "RPC: Registered %s transport module.\n", transport->name); result = 0; out: spin_unlock(&xprt_list_lock); return result; } EXPORT_SYMBOL_GPL(xprt_register_transport); /** * xprt_unregister_transport - unregister a transport implementation * @transport: transport to unregister * * Returns: * 0: transport successfully unregistered * -ENOENT: transport never registered */ int xprt_unregister_transport(struct xprt_class *transport) { struct xprt_class *t; int result; result = 0; spin_lock(&xprt_list_lock); list_for_each_entry(t, &xprt_list, list) { if (t == transport) { printk(KERN_INFO "RPC: Unregistered %s transport module.\n", transport->name); list_del_init(&transport->list); goto out; } } result = -ENOENT; out: spin_unlock(&xprt_list_lock); return result; } EXPORT_SYMBOL_GPL(xprt_unregister_transport); static void xprt_class_release(const struct xprt_class *t) { module_put(t->owner); } static const struct xprt_class * xprt_class_find_by_ident_locked(int ident) { const struct xprt_class *t; list_for_each_entry(t, &xprt_list, list) { if (t->ident != ident) continue; if (!try_module_get(t->owner)) continue; return t; } return NULL; } static const struct xprt_class * xprt_class_find_by_ident(int ident) { const struct xprt_class *t; spin_lock(&xprt_list_lock); t = xprt_class_find_by_ident_locked(ident); spin_unlock(&xprt_list_lock); return t; } static const struct xprt_class * xprt_class_find_by_netid_locked(const char *netid) { const struct xprt_class *t; unsigned int i; list_for_each_entry(t, &xprt_list, list) { for (i = 0; t->netid[i][0] != '\0'; i++) { if (strcmp(t->netid[i], netid) != 0) continue; if (!try_module_get(t->owner)) continue; return t; } } return NULL; } static const struct xprt_class * xprt_class_find_by_netid(const char *netid) { const struct xprt_class *t; spin_lock(&xprt_list_lock); t = xprt_class_find_by_netid_locked(netid); if (!t) { spin_unlock(&xprt_list_lock); request_module("rpc%s", netid); spin_lock(&xprt_list_lock); t = xprt_class_find_by_netid_locked(netid); } spin_unlock(&xprt_list_lock); return t; } /** * xprt_find_transport_ident - convert a netid into a transport identifier * @netid: transport to load * * Returns: * > 0: transport identifier * -ENOENT: transport module not available */ int xprt_find_transport_ident(const char *netid) { const struct xprt_class *t; int ret; t = xprt_class_find_by_netid(netid); if (!t) return -ENOENT; ret = t->ident; xprt_class_release(t); return ret; } EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) clear_bit_unlock(XPRT_LOCKED, &xprt->state); else queue_work(xprtiod_workqueue, &xprt->task_cleanup); } /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport * @xprt: pointer to the target transport * * This prevents mixing the payload of separate requests, and prevents * transport connects from colliding with writes. No congestion control * is provided. */ int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) goto out_locked; goto out_sleep; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; xprt->snd_task = task; out_locked: trace_xprt_reserve_xprt(xprt, task); return 1; out_unlock: xprt_clear_locked(xprt); out_sleep: task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task) || RPC_IS_SOFTCONN(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, xprt_request_timeout(req)); else rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); static bool xprt_need_congestion_window_wait(struct rpc_xprt *xprt) { return test_bit(XPRT_CWND_WAIT, &xprt->state); } static void xprt_set_congestion_window_wait(struct rpc_xprt *xprt) { if (!list_empty(&xprt->xmit_queue)) { /* Peek at head of queue to see if it can make progress */ if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst, rq_xmit)->rq_cong) return; } set_bit(XPRT_CWND_WAIT, &xprt->state); } static void xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt) { if (!RPCXPRT_CONGESTED(xprt)) clear_bit(XPRT_CWND_WAIT, &xprt->state); } /* * xprt_reserve_xprt_cong - serialize write access to transports * @task: task that is requesting access to the transport * * Same as xprt_reserve_xprt, but Van Jacobson congestion control is * integrated into the decision of whether a request is allowed to be * woken up and given access to the transport. * Note that the lock is only granted if we know there are free slots. */ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) goto out_locked; goto out_sleep; } if (req == NULL) { xprt->snd_task = task; goto out_locked; } if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; goto out_locked; } out_unlock: xprt_clear_locked(xprt); out_sleep: task->tk_status = -EAGAIN; if (RPC_IS_SOFT(task) || RPC_IS_SOFTCONN(task)) rpc_sleep_on_timeout(&xprt->sending, task, NULL, xprt_request_timeout(req)); else rpc_sleep_on(&xprt->sending, task, NULL); return 0; out_locked: trace_xprt_reserve_cong(xprt, task); return 1; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task) return 1; spin_lock(&xprt->transport_lock); retval = xprt->ops->reserve_xprt(xprt, task); spin_unlock(&xprt->transport_lock); return retval; } static bool __xprt_lock_write_func(struct rpc_task *task, void *data) { struct rpc_xprt *xprt = data; xprt->snd_task = task; return true; } static void __xprt_lock_write_next(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) goto out_unlock; if (xprt_need_congestion_window_wait(xprt)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting * @task: task that is releasing access to the transport * * Note that "task" can be NULL. No congestion control is provided. */ void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } trace_xprt_release_xprt(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt); /** * xprt_release_xprt_cong - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting * @task: task that is releasing access to the transport * * Note that "task" can be NULL. Another task is awoken to use the * transport if the transport's congestion window allows it. */ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } trace_xprt_release_cong(xprt, task); } EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task != task) return; spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); spin_unlock(&xprt->transport_lock); } /* * Van Jacobson congestion avoidance. Check if the congestion window * overflowed. Put the task to sleep if this is the case. */ static int __xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (req->rq_cong) return 1; trace_xprt_get_cong(xprt, req->rq_task); if (RPCXPRT_CONGESTED(xprt)) { xprt_set_congestion_window_wait(xprt); return 0; } req->rq_cong = 1; xprt->cong += RPC_CWNDSCALE; return 1; } /* * Adjust the congestion window, and wake up the next task * that has been sleeping due to congestion */ static void __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (!req->rq_cong) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; xprt_test_and_clear_congestion_window_wait(xprt); trace_xprt_put_cong(xprt, req->rq_task); __xprt_lock_write_next_cong(xprt); } /** * xprt_request_get_cong - Request congestion control credits * @xprt: pointer to transport * @req: pointer to RPC request * * Useful for transports that require congestion control. */ bool xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { bool ret = false; if (req->rq_cong) return true; spin_lock(&xprt->transport_lock); ret = __xprt_get_cong(xprt, req) != 0; spin_unlock(&xprt->transport_lock); return ret; } EXPORT_SYMBOL_GPL(xprt_request_get_cong); /** * xprt_release_rqst_cong - housekeeping when request is complete * @task: RPC request that recently completed * * Useful for transports that require congestion control. */ void xprt_release_rqst_cong(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; __xprt_put_cong(req->rq_xprt, req); } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); static void xprt_clear_congestion_window_wait_locked(struct rpc_xprt *xprt) { if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) __xprt_lock_write_next_cong(xprt); } /* * Clear the congestion window wait flag and wake up the next * entry on xprt->sending */ static void xprt_clear_congestion_window_wait(struct rpc_xprt *xprt) { if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) { spin_lock(&xprt->transport_lock); __xprt_lock_write_next_cong(xprt); spin_unlock(&xprt->transport_lock); } } /** * xprt_adjust_cwnd - adjust transport congestion window * @xprt: pointer to xprt * @task: recently completed RPC request used to adjust window * @result: result code of completed RPC request * * The transport code maintains an estimate on the maximum number of out- * standing RPC requests, using a smoothed version of the congestion * avoidance implemented in 44BSD. This is basically the Van Jacobson * congestion algorithm: If a retransmit occurs, the congestion window is * halved; otherwise, it is incremented by 1/cwnd when * * - a reply is received and * - a full number of requests are outstanding and * - the congestion window hasn't been updated recently. */ void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result) { struct rpc_rqst *req = task->tk_rqstp; unsigned long cwnd = xprt->cwnd; if (result >= 0 && cwnd <= xprt->cong) { /* The (cwnd >> 1) term makes sure * the result gets rounded properly. */ cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; if (cwnd > RPC_MAXCWND(xprt)) cwnd = RPC_MAXCWND(xprt); __xprt_lock_write_next_cong(xprt); } else if (result == -ETIMEDOUT) { cwnd >>= 1; if (cwnd < RPC_CWNDSCALE) cwnd = RPC_CWNDSCALE; } dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", xprt->cong, xprt->cwnd, cwnd); xprt->cwnd = cwnd; __xprt_put_cong(xprt, req); } EXPORT_SYMBOL_GPL(xprt_adjust_cwnd); /** * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue * @xprt: transport with waiting tasks * @status: result code to plant in each task before waking it * */ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) { if (status < 0) rpc_wake_up_status(&xprt->pending, status); else rpc_wake_up(&xprt->pending); } EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); /** * xprt_wait_for_buffer_space - wait for transport output buffer to clear * @xprt: transport * * Note that we only set the timer for the case of RPC_IS_SOFT(), since * we don't in general want to force a socket disconnection due to * an incomplete RPC call transmission. */ void xprt_wait_for_buffer_space(struct rpc_xprt *xprt) { set_bit(XPRT_WRITE_SPACE, &xprt->state); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); static bool xprt_clear_write_space_locked(struct rpc_xprt *xprt) { if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) { __xprt_lock_write_next(xprt); dprintk("RPC: write space: waking waiting task on " "xprt %p\n", xprt); return true; } return false; } /** * xprt_write_space - wake the task waiting for transport output buffer space * @xprt: transport with waiting tasks * * Can be called in a soft IRQ context, so xprt_write_space never sleeps. */ bool xprt_write_space(struct rpc_xprt *xprt) { bool ret; if (!test_bit(XPRT_WRITE_SPACE, &xprt->state)) return false; spin_lock(&xprt->transport_lock); ret = xprt_clear_write_space_locked(xprt); spin_unlock(&xprt->transport_lock); return ret; } EXPORT_SYMBOL_GPL(xprt_write_space); static unsigned long xprt_abs_ktime_to_jiffies(ktime_t abstime) { s64 delta = ktime_to_ns(ktime_get() - abstime); return likely(delta >= 0) ? jiffies - nsecs_to_jiffies(delta) : jiffies + nsecs_to_jiffies(-delta); } static unsigned long xprt_calc_majortimeo(struct rpc_rqst *req, const struct rpc_timeout *to) { unsigned long majortimeo = req->rq_timeout; if (to->to_exponential) majortimeo <<= to->to_retries; else majortimeo += to->to_increment * to->to_retries; if (majortimeo > to->to_maxval || majortimeo == 0) majortimeo = to->to_maxval; return majortimeo; } static void xprt_reset_majortimeo(struct rpc_rqst *req, const struct rpc_timeout *to) { req->rq_majortimeo += xprt_calc_majortimeo(req, to); } static void xprt_reset_minortimeo(struct rpc_rqst *req) { req->rq_minortimeo += req->rq_timeout; } static void xprt_init_majortimeo(struct rpc_task *task, struct rpc_rqst *req, const struct rpc_timeout *to) { unsigned long time_init; struct rpc_xprt *xprt = req->rq_xprt; if (likely(xprt && xprt_connected(xprt))) time_init = jiffies; else time_init = xprt_abs_ktime_to_jiffies(task->tk_start); req->rq_timeout = to->to_initval; req->rq_majortimeo = time_init + xprt_calc_majortimeo(req, to); req->rq_minortimeo = time_init + req->rq_timeout; } /** * xprt_adjust_timeout - adjust timeout values for next retransmit * @req: RPC request containing parameters to use for the adjustment * */ int xprt_adjust_timeout(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; int status = 0; if (time_before(jiffies, req->rq_majortimeo)) { if (time_before(jiffies, req->rq_minortimeo)) return status; if (to->to_exponential) req->rq_timeout <<= 1; else req->rq_timeout += to->to_increment; if (to->to_maxval && req->rq_timeout >= to->to_maxval) req->rq_timeout = to->to_maxval; req->rq_retries++; } else { req->rq_timeout = to->to_initval; req->rq_retries = 0; xprt_reset_majortimeo(req, to); /* Reset the RTT counters == "slow start" */ spin_lock(&xprt->transport_lock); rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); spin_unlock(&xprt->transport_lock); status = -ETIMEDOUT; } xprt_reset_minortimeo(req); if (req->rq_timeout == 0) { printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n"); req->rq_timeout = 5 * HZ; } return status; } static void xprt_autoclose(struct work_struct *work) { struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); unsigned int pflags = memalloc_nofs_save(); trace_xprt_disconnect_auto(xprt); xprt->connect_cookie++; smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); xprt->ops->close(xprt); xprt_release_write(xprt, NULL); wake_up_bit(&xprt->state, XPRT_LOCKED); memalloc_nofs_restore(pflags); } /** * xprt_disconnect_done - mark a transport as disconnected * @xprt: transport to flag for disconnect * */ void xprt_disconnect_done(struct rpc_xprt *xprt) { trace_xprt_disconnect_done(xprt); spin_lock(&xprt->transport_lock); xprt_clear_connected(xprt); xprt_clear_write_space_locked(xprt); xprt_clear_congestion_window_wait_locked(xprt); xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_disconnect_done); /** * xprt_schedule_autoclose_locked - Try to schedule an autoclose RPC call * @xprt: transport to disconnect */ static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_CLOSE_WAIT, &xprt->state)) return; if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(xprtiod_workqueue, &xprt->task_cleanup); else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) rpc_wake_up_queued_task_set_status(&xprt->pending, xprt->snd_task, -ENOTCONN); } /** * xprt_force_disconnect - force a transport to disconnect * @xprt: transport to disconnect * */ void xprt_force_disconnect(struct rpc_xprt *xprt) { trace_xprt_disconnect_force(xprt); /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); xprt_schedule_autoclose_locked(xprt); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); static unsigned int xprt_connect_cookie(struct rpc_xprt *xprt) { return READ_ONCE(xprt->connect_cookie); } static bool xprt_request_retransmit_after_disconnect(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; return req->rq_connect_cookie != xprt_connect_cookie(xprt) || !xprt_connected(xprt); } /** * xprt_conditional_disconnect - force a transport to disconnect * @xprt: transport to disconnect * @cookie: 'connection cookie' * * This attempts to break the connection if and only if 'cookie' matches * the current transport 'connection cookie'. It ensures that we don't * try to break the connection more than once when we need to retransmit * a batch of RPC requests. * */ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) { /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); if (cookie != xprt->connect_cookie) goto out; if (test_bit(XPRT_CLOSING, &xprt->state)) goto out; xprt_schedule_autoclose_locked(xprt); out: spin_unlock(&xprt->transport_lock); } static bool xprt_has_timer(const struct rpc_xprt *xprt) { return xprt->idle_timeout != 0; } static void xprt_schedule_autodisconnect(struct rpc_xprt *xprt) __must_hold(&xprt->transport_lock) { xprt->last_used = jiffies; if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); } static void xprt_init_autodisconnect(struct timer_list *t) { struct rpc_xprt *xprt = from_timer(xprt, t, timer); if (!RB_EMPTY_ROOT(&xprt->recv_queue)) return; /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ xprt->last_used = jiffies; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; queue_work(xprtiod_workqueue, &xprt->task_cleanup); } #if IS_ENABLED(CONFIG_FAIL_SUNRPC) static void xprt_inject_disconnect(struct rpc_xprt *xprt) { if (!fail_sunrpc.ignore_client_disconnect && should_fail(&fail_sunrpc.attr, 1)) xprt->ops->inject_disconnect(xprt); } #else static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) { } #endif bool xprt_lock_connect(struct rpc_xprt *xprt, struct rpc_task *task, void *cookie) { bool ret = false; spin_lock(&xprt->transport_lock); if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; if (xprt->snd_task != task) goto out; set_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->snd_task = cookie; ret = true; out: spin_unlock(&xprt->transport_lock); return ret; } EXPORT_SYMBOL_GPL(xprt_lock_connect); void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) { spin_lock(&xprt->transport_lock); if (xprt->snd_task != cookie) goto out; if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; xprt->snd_task =NULL; clear_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->ops->release_xprt(xprt, NULL); xprt_schedule_autodisconnect(xprt); out: spin_unlock(&xprt->transport_lock); wake_up_bit(&xprt->state, XPRT_LOCKED); } EXPORT_SYMBOL_GPL(xprt_unlock_connect); /** * xprt_connect - schedule a transport connect operation * @task: RPC task that is requesting the connect * */ void xprt_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; trace_xprt_connect(xprt); if (!xprt_bound(xprt)) { task->tk_status = -EAGAIN; return; } if (!xprt_lock_write(xprt, task)) return; if (!xprt_connected(xprt) && !test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on_timeout(&xprt->pending, task, NULL, xprt_request_timeout(task->tk_rqstp)); if (test_bit(XPRT_CLOSING, &xprt->state)) return; if (xprt_test_and_set_connecting(xprt)) return; /* Race breaker */ if (!xprt_connected(xprt)) { xprt->stat.connect_start = jiffies; xprt->ops->connect(xprt, task); } else { xprt_clear_connecting(xprt); task->tk_status = 0; rpc_wake_up_queued_task(&xprt->pending, task); } } xprt_release_write(xprt, task); } /** * xprt_reconnect_delay - compute the wait before scheduling a connect * @xprt: transport instance * */ unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt) { unsigned long start, now = jiffies; start = xprt->stat.connect_start + xprt->reestablish_timeout; if (time_after(start, now)) return start - now; return 0; } EXPORT_SYMBOL_GPL(xprt_reconnect_delay); /** * xprt_reconnect_backoff - compute the new re-establish timeout * @xprt: transport instance * @init_to: initial reestablish timeout * */ void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to) { xprt->reestablish_timeout <<= 1; if (xprt->reestablish_timeout > xprt->max_reconnect_timeout) xprt->reestablish_timeout = xprt->max_reconnect_timeout; if (xprt->reestablish_timeout < init_to) xprt->reestablish_timeout = init_to; } EXPORT_SYMBOL_GPL(xprt_reconnect_backoff); enum xprt_xid_rb_cmp { XID_RB_EQUAL, XID_RB_LEFT, XID_RB_RIGHT, }; static enum xprt_xid_rb_cmp xprt_xid_cmp(__be32 xid1, __be32 xid2) { if (xid1 == xid2) return XID_RB_EQUAL; if ((__force u32)xid1 < (__force u32)xid2) return XID_RB_LEFT; return XID_RB_RIGHT; } static struct rpc_rqst * xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid) { struct rb_node *n = xprt->recv_queue.rb_node; struct rpc_rqst *req; while (n != NULL) { req = rb_entry(n, struct rpc_rqst, rq_recv); switch (xprt_xid_cmp(xid, req->rq_xid)) { case XID_RB_LEFT: n = n->rb_left; break; case XID_RB_RIGHT: n = n->rb_right; break; case XID_RB_EQUAL: return req; } } return NULL; } static void xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new) { struct rb_node **p = &xprt->recv_queue.rb_node; struct rb_node *n = NULL; struct rpc_rqst *req; while (*p != NULL) { n = *p; req = rb_entry(n, struct rpc_rqst, rq_recv); switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) { case XID_RB_LEFT: p = &n->rb_left; break; case XID_RB_RIGHT: p = &n->rb_right; break; case XID_RB_EQUAL: WARN_ON_ONCE(new != req); return; } } rb_link_node(&new->rq_recv, n, p); rb_insert_color(&new->rq_recv, &xprt->recv_queue); } static void xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req) { rb_erase(&req->rq_recv, &xprt->recv_queue); } /** * xprt_lookup_rqst - find an RPC request corresponding to an XID * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * * Caller holds xprt->queue_lock. */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { struct rpc_rqst *entry; entry = xprt_request_rb_find(xprt, xid); if (entry != NULL) { trace_xprt_lookup_rqst(xprt, xid, 0); entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); return entry; } dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", ntohl(xid)); trace_xprt_lookup_rqst(xprt, xid, -ENOENT); xprt->stat.bad_xids++; return NULL; } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); static bool xprt_is_pinned_rqst(struct rpc_rqst *req) { return atomic_read(&req->rq_pin) != 0; } /** * xprt_pin_rqst - Pin a request on the transport receive list * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() * so should be holding xprt->queue_lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { atomic_inc(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_pin_rqst); /** * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to pin * * Caller should be holding xprt->queue_lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) { atomic_dec(&req->rq_pin); return; } if (atomic_dec_and_test(&req->rq_pin)) wake_up_var(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_unpin_rqst); static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) { wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req)); } static bool xprt_request_data_received(struct rpc_task *task) { return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0; } static bool xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req) { return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0; } /** * xprt_request_enqueue_receive - Add an request to the receive queue * @task: RPC task * */ int xprt_request_enqueue_receive(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int ret; if (!xprt_request_need_enqueue_receive(task, req)) return 0; ret = xprt_request_prepare(task->tk_rqstp, &req->rq_rcv_buf); if (ret) return ret; spin_lock(&xprt->queue_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ xprt_request_rb_insert(xprt, req); set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); spin_unlock(&xprt->queue_lock); /* Turn off autodisconnect */ del_timer_sync(&xprt->timer); return 0; } /** * xprt_request_dequeue_receive_locked - Remove a request from the receive queue * @task: RPC task * * Caller must hold xprt->queue_lock. */ static void xprt_request_dequeue_receive_locked(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) xprt_request_rb_remove(req->rq_xprt, req); } /** * xprt_update_rtt - Update RPC RTT statistics * @task: RPC request that recently completed * * Caller holds xprt->queue_lock. */ void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_rtt *rtt = task->tk_client->cl_rtt; unsigned int timer = task->tk_msg.rpc_proc->p_timer; long m = usecs_to_jiffies(ktime_to_us(req->rq_rtt)); if (timer) { if (req->rq_ntrans == 1) rpc_update_rtt(rtt, timer, m); rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } } EXPORT_SYMBOL_GPL(xprt_update_rtt); /** * xprt_complete_rqst - called when reply processing is complete * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * * Caller holds xprt->queue_lock. */ void xprt_complete_rqst(struct rpc_task *task, int copied) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; xprt->stat.recvs++; xdr_free_bvec(&req->rq_rcv_buf); req->rq_private_buf.bvec = NULL; req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; xprt_request_dequeue_receive_locked(task); rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); static void xprt_timer(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (task->tk_status != -ETIMEDOUT) return; trace_xprt_timer(xprt, req->rq_xid, task->tk_status); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(xprt, task); } else task->tk_status = 0; } /** * xprt_wait_for_reply_request_def - wait for reply * @task: pointer to rpc_task * * Set a request's retransmit timeout based on the transport's * default timeout parameters. Used by transports that don't adjust * the retransmit timeout based on round-trip time estimation, * and put the task to sleep on the pending queue. */ void xprt_wait_for_reply_request_def(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, xprt_request_timeout(req)); } EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_def); /** * xprt_wait_for_reply_request_rtt - wait for reply using RTT estimator * @task: pointer to rpc_task * * Set a request's retransmit timeout using the RTT estimator, * and put the task to sleep on the pending queue. */ void xprt_wait_for_reply_request_rtt(struct rpc_task *task) { int timer = task->tk_msg.rpc_proc->p_timer; struct rpc_clnt *clnt = task->tk_client; struct rpc_rtt *rtt = clnt->cl_rtt; struct rpc_rqst *req = task->tk_rqstp; unsigned long max_timeout = clnt->cl_timeout->to_maxval; unsigned long timeout; timeout = rpc_calc_rto(rtt, timer); timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; if (timeout > max_timeout || timeout == 0) timeout = max_timeout; rpc_sleep_on_timeout(&req->rq_xprt->pending, task, xprt_timer, jiffies + timeout); } EXPORT_SYMBOL_GPL(xprt_wait_for_reply_request_rtt); /** * xprt_request_wait_receive - wait for the reply to an RPC request * @task: RPC task about to send a request * */ void xprt_request_wait_receive(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) return; /* * Sleep on the pending queue if we're expecting a reply. * The spinlock ensures atomicity between the test of * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). */ spin_lock(&xprt->queue_lock); if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { xprt->ops->wait_for_reply_request(task); /* * Send an extra queue wakeup call if the * connection was dropped in case the call to * rpc_sleep_on() raced. */ if (xprt_request_retransmit_after_disconnect(task)) rpc_wake_up_queued_task_set_status(&xprt->pending, task, -ENOTCONN); } spin_unlock(&xprt->queue_lock); } static bool xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req) { return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); } /** * xprt_request_enqueue_transmit - queue a task for transmission * @task: pointer to rpc_task * * Add a task to the transmission queue. */ void xprt_request_enqueue_transmit(struct rpc_task *task) { struct rpc_rqst *pos, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int ret; if (xprt_request_need_enqueue_transmit(task, req)) { ret = xprt_request_prepare(task->tk_rqstp, &req->rq_snd_buf); if (ret) { task->tk_status = ret; return; } req->rq_bytes_sent = 0; spin_lock(&xprt->queue_lock); /* * Requests that carry congestion control credits are added * to the head of the list to avoid starvation issues. */ if (req->rq_cong) { xprt_clear_congestion_window_wait(xprt); list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_cong) continue; /* Note: req is added _before_ pos */ list_add_tail(&req->rq_xmit, &pos->rq_xmit); INIT_LIST_HEAD(&req->rq_xmit2); goto out; } } else if (!req->rq_seqno) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; list_add_tail(&req->rq_xmit2, &pos->rq_xmit2); INIT_LIST_HEAD(&req->rq_xmit); goto out; } } list_add_tail(&req->rq_xmit, &xprt->xmit_queue); INIT_LIST_HEAD(&req->rq_xmit2); out: atomic_long_inc(&xprt->xmit_queuelen); set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); } } /** * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue * @task: pointer to rpc_task * * Remove a task from the transmission queue * Caller must hold xprt->queue_lock */ static void xprt_request_dequeue_transmit_locked(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) return; if (!list_empty(&req->rq_xmit)) { struct rpc_xprt *xprt = req->rq_xprt; if (list_is_first(&req->rq_xmit, &xprt->xmit_queue) && xprt->ops->abort_send_request) xprt->ops->abort_send_request(req); list_del(&req->rq_xmit); if (!list_empty(&req->rq_xmit2)) { struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, struct rpc_rqst, rq_xmit2); list_del(&req->rq_xmit2); list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue); } } else list_del(&req->rq_xmit2); atomic_long_dec(&req->rq_xprt->xmit_queuelen); xdr_free_bvec(&req->rq_snd_buf); } /** * xprt_request_dequeue_transmit - remove a task from the transmission queue * @task: pointer to rpc_task * * Remove a task from the transmission queue */ static void xprt_request_dequeue_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; spin_lock(&xprt->queue_lock); xprt_request_dequeue_transmit_locked(task); spin_unlock(&xprt->queue_lock); } /** * xprt_request_dequeue_xprt - remove a task from the transmit+receive queue * @task: pointer to rpc_task * * Remove a task from the transmit and receive queues, and ensure that * it is not pinned by the receive work item. */ void xprt_request_dequeue_xprt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || xprt_is_pinned_rqst(req)) { spin_lock(&xprt->queue_lock); while (xprt_is_pinned_rqst(req)) { set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); xprt_wait_on_pinned_rqst(req); spin_lock(&xprt->queue_lock); clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); } xprt_request_dequeue_transmit_locked(task); xprt_request_dequeue_receive_locked(task); spin_unlock(&xprt->queue_lock); xdr_free_bvec(&req->rq_rcv_buf); } } /** * xprt_request_prepare - prepare an encoded request for transport * @req: pointer to rpc_rqst * @buf: pointer to send/rcv xdr_buf * * Calls into the transport layer to do whatever is needed to prepare * the request for transmission or receive. * Returns error, or zero. */ static int xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf) { struct rpc_xprt *xprt = req->rq_xprt; if (xprt->ops->prepare_request) return xprt->ops->prepare_request(req, buf); return 0; } /** * xprt_request_need_retransmit - Test if a task needs retransmission * @task: pointer to rpc_task * * Test for whether a connection breakage requires the task to retransmit */ bool xprt_request_need_retransmit(struct rpc_task *task) { return xprt_request_retransmit_after_disconnect(task); } /** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request * */ bool xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; if (!xprt_lock_write(xprt, task)) { /* Race breaker: someone may have transmitted us */ if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) rpc_wake_up_queued_task_set_status(&xprt->sending, task, 0); return false; } if (atomic_read(&xprt->swapper)) /* This will be clear in __rpc_execute */ current->flags |= PF_MEMALLOC; return true; } void xprt_end_transmit(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; xprt_inject_disconnect(xprt); xprt_release_write(xprt, task); } /** * xprt_request_transmit - send an RPC request on a transport * @req: pointer to request to transmit * @snd_task: RPC task that owns the transport lock * * This performs the transmission of a single request. * Note that if the request is not the same as snd_task, then it * does need to be pinned. * Returns '0' on success. */ static int xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) { struct rpc_xprt *xprt = req->rq_xprt; struct rpc_task *task = req->rq_task; unsigned int connect_cookie; int is_retrans = RPC_WAS_SENT(task); int status; if (test_bit(XPRT_CLOSE_WAIT, &xprt->state)) return -ENOTCONN; if (!req->rq_bytes_sent) { if (xprt_request_data_received(task)) { status = 0; goto out_dequeue; } /* Verify that our message lies in the RPCSEC_GSS window */ if (rpcauth_xmit_need_reencode(task)) { status = -EBADMSG; goto out_dequeue; } if (RPC_SIGNALLED(task)) { status = -ERESTARTSYS; goto out_dequeue; } } /* * Update req->rq_ntrans before transmitting to avoid races with * xprt_update_rtt(), which needs to know that it is recording a * reply to the first transmission. */ req->rq_ntrans++; trace_rpc_xdr_sendto(task, &req->rq_snd_buf); connect_cookie = xprt->connect_cookie; status = xprt->ops->send_request(req); if (status != 0) { req->rq_ntrans--; trace_xprt_transmit(req, status); return status; } if (is_retrans) { task->tk_client->cl_stats->rpcretrans++; trace_xprt_retransmit(req); } xprt_inject_disconnect(xprt); task->tk_flags |= RPC_TASK_SENT; spin_lock(&xprt->transport_lock); xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; xprt->stat.sending_u += xprt->sending.qlen; xprt->stat.pending_u += xprt->pending.qlen; spin_unlock(&xprt->transport_lock); req->rq_connect_cookie = connect_cookie; out_dequeue: trace_xprt_transmit(req, status); xprt_request_dequeue_transmit(task); rpc_wake_up_queued_task_set_status(&xprt->sending, task, status); return status; } /** * xprt_transmit - send an RPC request on a transport * @task: controlling RPC task * * Attempts to drain the transmit queue. On exit, either the transport * signalled an error that needs to be handled before transmission can * resume, or @task finished transmitting, and detected that it already * received a reply. */ void xprt_transmit(struct rpc_task *task) { struct rpc_rqst *next, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int status; spin_lock(&xprt->queue_lock); for (;;) { next = list_first_entry_or_null(&xprt->xmit_queue, struct rpc_rqst, rq_xmit); if (!next) break; xprt_pin_rqst(next); spin_unlock(&xprt->queue_lock); status = xprt_request_transmit(next, task); if (status == -EBADMSG && next != req) status = 0; spin_lock(&xprt->queue_lock); xprt_unpin_rqst(next); if (status < 0) { if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) task->tk_status = status; break; } /* Was @task transmitted, and has it received a reply? */ if (xprt_request_data_received(task) && !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) break; cond_resched_lock(&xprt->queue_lock); } spin_unlock(&xprt->queue_lock); } static void xprt_complete_request_init(struct rpc_task *task) { if (task->tk_rqstp) xprt_request_init(task); } void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) { set_bit(XPRT_CONGESTED, &xprt->state); rpc_sleep_on(&xprt->backlog, task, xprt_complete_request_init); } EXPORT_SYMBOL_GPL(xprt_add_backlog); static bool __xprt_set_rq(struct rpc_task *task, void *data) { struct rpc_rqst *req = data; if (task->tk_rqstp == NULL) { memset(req, 0, sizeof(*req)); /* mark unused */ task->tk_rqstp = req; return true; } return false; } bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (rpc_wake_up_first(&xprt->backlog, __xprt_set_rq, req) == NULL) { clear_bit(XPRT_CONGESTED, &xprt->state); return false; } return true; } EXPORT_SYMBOL_GPL(xprt_wake_up_backlog); static bool xprt_throttle_congested(struct rpc_xprt *xprt, struct rpc_task *task) { bool ret = false; if (!test_bit(XPRT_CONGESTED, &xprt->state)) goto out; spin_lock(&xprt->reserve_lock); if (test_bit(XPRT_CONGESTED, &xprt->state)) { xprt_add_backlog(xprt, task); ret = true; } spin_unlock(&xprt->reserve_lock); out: return ret; } static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); if (xprt->num_reqs >= xprt->max_reqs) goto out; ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); req = kzalloc(sizeof(*req), rpc_task_gfp_mask()); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; --xprt->num_reqs; req = ERR_PTR(-ENOMEM); out: return req; } static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { if (xprt->num_reqs > xprt->min_reqs) { --xprt->num_reqs; kfree(req); return true; } return false; } void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req; spin_lock(&xprt->reserve_lock); if (!list_empty(&xprt->free)) { req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); list_del(&req->rq_list); goto out_init_req; } req = xprt_dynamic_alloc_slot(xprt); if (!IS_ERR(req)) goto out_init_req; switch (PTR_ERR(req)) { case -ENOMEM: dprintk("RPC: dynamic allocation of request slot " "failed! Retrying\n"); task->tk_status = -ENOMEM; break; case -EAGAIN: xprt_add_backlog(xprt, task); dprintk("RPC: waiting for request slot\n"); fallthrough; default: task->tk_status = -EAGAIN; } spin_unlock(&xprt->reserve_lock); return; out_init_req: xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots, xprt->num_reqs); spin_unlock(&xprt->reserve_lock); task->tk_status = 0; task->tk_rqstp = req; } EXPORT_SYMBOL_GPL(xprt_alloc_slot); void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); if (!xprt_wake_up_backlog(xprt, req) && !xprt_dynamic_free_slot(xprt, req)) { memset(req, 0, sizeof(*req)); /* mark unused */ list_add(&req->rq_list, &xprt->free); } spin_unlock(&xprt->reserve_lock); } EXPORT_SYMBOL_GPL(xprt_free_slot); static void xprt_free_all_slots(struct rpc_xprt *xprt) { struct rpc_rqst *req; while (!list_empty(&xprt->free)) { req = list_first_entry(&xprt->free, struct rpc_rqst, rq_list); list_del(&req->rq_list); kfree(req); } } static DEFINE_IDA(rpc_xprt_ids); void xprt_cleanup_ids(void) { ida_destroy(&rpc_xprt_ids); } static int xprt_alloc_id(struct rpc_xprt *xprt) { int id; id = ida_alloc(&rpc_xprt_ids, GFP_KERNEL); if (id < 0) return id; xprt->id = id; return 0; } static void xprt_free_id(struct rpc_xprt *xprt) { ida_free(&rpc_xprt_ids, xprt->id); } struct rpc_xprt *xprt_alloc(struct net *net, size_t size, unsigned int num_prealloc, unsigned int max_alloc) { struct rpc_xprt *xprt; struct rpc_rqst *req; int i; xprt = kzalloc(size, GFP_KERNEL); if (xprt == NULL) goto out; xprt_alloc_id(xprt); xprt_init(xprt, net); for (i = 0; i < num_prealloc; i++) { req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL); if (!req) goto out_free; list_add(&req->rq_list, &xprt->free); } xprt->max_reqs = max_t(unsigned int, max_alloc, num_prealloc); xprt->min_reqs = num_prealloc; xprt->num_reqs = num_prealloc; return xprt; out_free: xprt_free(xprt); out: return NULL; } EXPORT_SYMBOL_GPL(xprt_alloc); void xprt_free(struct rpc_xprt *xprt) { put_net_track(xprt->xprt_net, &xprt->ns_tracker); xprt_free_all_slots(xprt); xprt_free_id(xprt); rpc_sysfs_xprt_destroy(xprt); kfree_rcu(xprt, rcu); } EXPORT_SYMBOL_GPL(xprt_free); static void xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt) { req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1; } static __be32 xprt_alloc_xid(struct rpc_xprt *xprt) { __be32 xid; spin_lock(&xprt->reserve_lock); xid = (__force __be32)xprt->xid++; spin_unlock(&xprt->reserve_lock); return xid; } static void xprt_init_xid(struct rpc_xprt *xprt) { xprt->xid = get_random_u32(); } static void xprt_request_init(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; req->rq_xid = xprt_alloc_xid(xprt); xprt_init_connect_cookie(req, xprt); req->rq_snd_buf.len = 0; req->rq_snd_buf.buflen = 0; req->rq_rcv_buf.len = 0; req->rq_rcv_buf.buflen = 0; req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; xprt_init_majortimeo(task, req, task->tk_client->cl_timeout); trace_xprt_reserve(req); } static void xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task) { xprt->ops->alloc_slot(xprt, task); if (task->tk_rqstp != NULL) xprt_request_init(task); } /** * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation * * If the transport is marked as being congested, or if no more * slots are available, place the task on the transport's * backlog queue. */ void xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = 0; if (task->tk_rqstp != NULL) return; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) xprt_do_reserve(xprt, task); } /** * xprt_retry_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation * * If no more slots are available, place the task on the transport's * backlog queue. * Note that the only difference with xprt_reserve is that we now * ignore the value of the XPRT_CONGESTED flag. */ void xprt_retry_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; task->tk_status = 0; if (task->tk_rqstp != NULL) return; task->tk_status = -EAGAIN; xprt_do_reserve(xprt, task); } /** * xprt_release - release an RPC request slot * @task: task which is finished with the slot * */ void xprt_release(struct rpc_task *task) { struct rpc_xprt *xprt; struct rpc_rqst *req = task->tk_rqstp; if (req == NULL) { if (task->tk_client) { xprt = task->tk_xprt; xprt_release_write(xprt, task); } return; } xprt = req->rq_xprt; xprt_request_dequeue_xprt(task); spin_lock(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) xprt->ops->release_request(task); xprt_schedule_autodisconnect(xprt); spin_unlock(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); if (req->rq_release_snd_buf) req->rq_release_snd_buf(req); task->tk_rqstp = NULL; if (likely(!bc_prealloc(req))) xprt->ops->free_slot(xprt, req); else xprt_free_bc_request(req); } #ifdef CONFIG_SUNRPC_BACKCHANNEL void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task, const struct rpc_timeout *to) { struct xdr_buf *xbufp = &req->rq_snd_buf; task->tk_rqstp = req; req->rq_task = task; xprt_init_connect_cookie(req, req->rq_xprt); /* * Set up the xdr_buf length. * This also indicates that the buffer is XDR encoded already. */ xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + xbufp->tail[0].iov_len; /* * Backchannel Replies are sent with !RPC_TASK_SOFT and * RPC_TASK_NO_RETRANS_TIMEOUT. The major timeout setting * affects only how long each Reply waits to be sent when * a transport connection cannot be established. */ xprt_init_majortimeo(task, req, to); } #endif static void xprt_init(struct rpc_xprt *xprt, struct net *net) { kref_init(&xprt->kref); spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); spin_lock_init(&xprt->queue_lock); INIT_LIST_HEAD(&xprt->free); xprt->recv_queue = RB_ROOT; INIT_LIST_HEAD(&xprt->xmit_queue); #if defined(CONFIG_SUNRPC_BACKCHANNEL) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ INIT_LIST_HEAD(&xprt->xprt_switch); xprt->last_used = jiffies; xprt->cwnd = RPC_INITCWND; xprt->bind_index = 0; rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); xprt_init_xid(xprt); xprt->xprt_net = get_net_track(net, &xprt->ns_tracker, GFP_KERNEL); } /** * xprt_create_transport - create an RPC transport * @args: rpc transport creation arguments * */ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; const struct xprt_class *t; t = xprt_class_find_by_ident(args->ident); if (!t) { dprintk("RPC: transport (%d) not supported\n", args->ident); return ERR_PTR(-EIO); } xprt = t->setup(args); xprt_class_release(t); if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) timer_setup(&xprt->timer, xprt_init_autodisconnect, 0); else timer_setup(&xprt->timer, NULL, 0); if (strlen(args->servername) > RPC_MAXNETNAMELEN) { xprt_destroy(xprt); return ERR_PTR(-EINVAL); } xprt->servername = kstrdup(args->servername, GFP_KERNEL); if (xprt->servername == NULL) { xprt_destroy(xprt); return ERR_PTR(-ENOMEM); } rpc_xprt_debugfs_register(xprt); trace_xprt_create(xprt); out: return xprt; } static void xprt_destroy_cb(struct work_struct *work) { struct rpc_xprt *xprt = container_of(work, struct rpc_xprt, task_cleanup); trace_xprt_destroy(xprt); rpc_xprt_debugfs_unregister(xprt); rpc_destroy_wait_queue(&xprt->binding); rpc_destroy_wait_queue(&xprt->pending); rpc_destroy_wait_queue(&xprt->sending); rpc_destroy_wait_queue(&xprt->backlog); kfree(xprt->servername); /* * Destroy any existing back channel */ xprt_destroy_backchannel(xprt, UINT_MAX); /* * Tear down transport state and free the rpc_xprt */ xprt->ops->destroy(xprt); } /** * xprt_destroy - destroy an RPC transport, killing off all requests. * @xprt: transport to destroy * */ static void xprt_destroy(struct rpc_xprt *xprt) { /* * Exclude transport connect/disconnect handlers and autoclose */ wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE); /* * xprt_schedule_autodisconnect() can run after XPRT_LOCKED * is cleared. We use ->transport_lock to ensure the mod_timer() * can only run *before* del_time_sync(), never after. */ spin_lock(&xprt->transport_lock); del_timer_sync(&xprt->timer); spin_unlock(&xprt->transport_lock); /* * Destroy sockets etc from the system workqueue so they can * safely flush receive work running on rpciod. */ INIT_WORK(&xprt->task_cleanup, xprt_destroy_cb); schedule_work(&xprt->task_cleanup); } static void xprt_destroy_kref(struct kref *kref) { xprt_destroy(container_of(kref, struct rpc_xprt, kref)); } /** * xprt_get - return a reference to an RPC transport. * @xprt: pointer to the transport * */ struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) { if (xprt != NULL && kref_get_unless_zero(&xprt->kref)) return xprt; return NULL; } EXPORT_SYMBOL_GPL(xprt_get); /** * xprt_put - release a reference to an RPC transport. * @xprt: pointer to the transport * */ void xprt_put(struct rpc_xprt *xprt) { if (xprt != NULL) kref_put(&xprt->kref, xprt_destroy_kref); } EXPORT_SYMBOL_GPL(xprt_put); void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) { if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) { spin_lock(&xps->xps_lock); xps->xps_nactive--; spin_unlock(&xps->xps_lock); } } void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) { if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) { spin_lock(&xps->xps_lock); xps->xps_nactive++; spin_unlock(&xps->xps_lock); } } void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) { if (test_and_set_bit(XPRT_REMOVE, &xprt->state)) return; xprt_force_disconnect(xprt); if (!test_bit(XPRT_CONNECTED, &xprt->state)) return; if (!xprt->sending.qlen && !xprt->pending.qlen && !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen)) rpc_xprt_switch_remove_xprt(xps, xprt, true); }
82 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* Copyright (c) 2002-2007 Volkswagen Group Electronic Research * Copyright (c) 2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de> * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #ifndef CAN_ML_H #define CAN_ML_H #include <linux/can.h> #include <linux/list.h> #include <linux/netdevice.h> #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) #define CAN_EFF_RCV_HASH_BITS 10 #define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS) enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX }; struct can_dev_rcv_lists { struct hlist_head rx[RX_MAX]; struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ]; struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ]; int entries; }; struct can_ml_priv { struct can_dev_rcv_lists dev_rcv_lists; #ifdef CAN_J1939 struct j1939_priv *j1939_priv; #endif }; static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev) { return netdev_get_ml_priv(dev, ML_PRIV_CAN); } static inline void can_set_ml_priv(struct net_device *dev, struct can_ml_priv *ml_priv) { netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN); } #endif /* CAN_ML_H */
2 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 1 1 2 3 2 2 3 3 2 1 1 1 1 2 2 1 2 3 3 3 3 3 3 4 4 4 4 4 2 2 4 4 2 46 1 1 1 1 5 4 4 4 4 1 2 1 1 3 37 46 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 // SPDX-License-Identifier: GPL-2.0-only /* By Ross Biro 1/23/92 */ /* * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> #include <linux/audit.h> #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/context_tracking.h> #include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/processor.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> #include <asm/prctl.h> #include <asm/proto.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/syscall.h> #include <asm/fsgsbase.h> #include <asm/io_bitmap.h> #include "tls.h" enum x86_regset_32 { REGSET32_GENERAL, REGSET32_FP, REGSET32_XFP, REGSET32_XSTATE, REGSET32_TLS, REGSET32_IOPERM, }; enum x86_regset_64 { REGSET64_GENERAL, REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, REGSET64_SSP, }; #define REGSET_GENERAL \ ({ \ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ REGSET32_GENERAL; \ }) #define REGSET_FP \ ({ \ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ REGSET32_FP; \ }) struct pt_regs_offset { const char *name; int offset; }; #define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} #define REG_OFFSET_END {.name = NULL, .offset = 0} static const struct pt_regs_offset regoffset_table[] = { #ifdef CONFIG_X86_64 REG_OFFSET_NAME(r15), REG_OFFSET_NAME(r14), REG_OFFSET_NAME(r13), REG_OFFSET_NAME(r12), REG_OFFSET_NAME(r11), REG_OFFSET_NAME(r10), REG_OFFSET_NAME(r9), REG_OFFSET_NAME(r8), #endif REG_OFFSET_NAME(bx), REG_OFFSET_NAME(cx), REG_OFFSET_NAME(dx), REG_OFFSET_NAME(si), REG_OFFSET_NAME(di), REG_OFFSET_NAME(bp), REG_OFFSET_NAME(ax), #ifdef CONFIG_X86_32 REG_OFFSET_NAME(ds), REG_OFFSET_NAME(es), REG_OFFSET_NAME(fs), REG_OFFSET_NAME(gs), #endif REG_OFFSET_NAME(orig_ax), REG_OFFSET_NAME(ip), REG_OFFSET_NAME(cs), REG_OFFSET_NAME(flags), REG_OFFSET_NAME(sp), REG_OFFSET_NAME(ss), REG_OFFSET_END, }; /** * regs_query_register_offset() - query register offset from its name * @name: the name of a register * * regs_query_register_offset() returns the offset of a register in struct * pt_regs from its name. If the name is invalid, this returns -EINVAL; */ int regs_query_register_offset(const char *name) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (!strcmp(roff->name, name)) return roff->offset; return -EINVAL; } /** * regs_query_register_name() - query register name from its offset * @offset: the offset of a register in struct pt_regs. * * regs_query_register_name() returns the name of a register from its * offset in struct pt_regs. If the @offset is invalid, this returns NULL; */ const char *regs_query_register_name(unsigned int offset) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (roff->offset == offset) return roff->name; return NULL; } /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. */ /* * Determines which flags the user has access to [1 = access, 0 = no access]. */ #define FLAG_MASK_32 ((unsigned long) \ (X86_EFLAGS_CF | X86_EFLAGS_PF | \ X86_EFLAGS_AF | X86_EFLAGS_ZF | \ X86_EFLAGS_SF | X86_EFLAGS_TF | \ X86_EFLAGS_DF | X86_EFLAGS_OF | \ X86_EFLAGS_RF | X86_EFLAGS_AC)) /* * Determines whether a value may be installed in a segment register. */ static inline bool invalid_selector(u16 value) { return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); } #ifdef CONFIG_X86_32 #define FLAG_MASK FLAG_MASK_32 static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); return &regs->bx + (regno >> 2); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. */ unsigned int retval; if (offset != offsetof(struct user_regs_struct, gs)) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) savesegment(gs, retval); else retval = task->thread.gs; } return retval; } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * For %cs and %ss we cannot permit a null selector. * We can permit a bogus selector as long as it has USER_RPL. * Null selectors are fine for other segment registers, but * we will never get back to user mode with invalid %cs or %ss * and will take the trap in iret instead. Much code relies * on user_mode() to distinguish a user trap frame (which can * safely use invalid selectors) from a kernel trap frame. */ switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): if (unlikely(value == 0)) return -EIO; fallthrough; default: *pt_regs_access(task_pt_regs(task), offset) = value; break; case offsetof(struct user_regs_struct, gs): task->thread.gs = value; } return 0; } #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) { BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); return &regs->r15 + (offset / sizeof(regs->r15)); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. */ unsigned int seg; switch (offset) { case offsetof(struct user_regs_struct, fs): if (task == current) { /* Older gas can't assemble movq %?s,%r?? */ asm("movl %%fs,%0" : "=r" (seg)); return seg; } return task->thread.fsindex; case offsetof(struct user_regs_struct, gs): if (task == current) { asm("movl %%gs,%0" : "=r" (seg)); return seg; } return task->thread.gsindex; case offsetof(struct user_regs_struct, ds): if (task == current) { asm("movl %%ds,%0" : "=r" (seg)); return seg; } return task->thread.ds; case offsetof(struct user_regs_struct, es): if (task == current) { asm("movl %%es,%0" : "=r" (seg)); return seg; } return task->thread.es; case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): break; } return *pt_regs_access(task_pt_regs(task), offset); } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * Writes to FS and GS will change the stored selector. Whether * this changes the segment base as well depends on whether * FSGSBASE is enabled. */ switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; break; case offsetof(struct user_regs_struct,gs): task->thread.gsindex = value; break; case offsetof(struct user_regs_struct,ds): task->thread.ds = value; break; case offsetof(struct user_regs_struct,es): task->thread.es = value; break; /* * Can't actually change these in 64-bit mode. */ case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->ss = value; break; } return 0; } #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) { unsigned long retval = task_pt_regs(task)->flags; /* * If the debugger set TF, hide it from the readout. */ if (test_tsk_thread_flag(task, TIF_FORCED_TF)) retval &= ~X86_EFLAGS_TF; return retval; } static int set_flags(struct task_struct *task, unsigned long value) { struct pt_regs *regs = task_pt_regs(task); /* * If the user value contains TF, mark that * it was not "us" (the debugger) that set it. * If not, make sure it stays set if we had. */ if (value & X86_EFLAGS_TF) clear_tsk_thread_flag(task, TIF_FORCED_TF); else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) value |= X86_EFLAGS_TF; regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); return 0; } static int putreg(struct task_struct *child, unsigned long offset, unsigned long value) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return set_segment_reg(child, offset, value); case offsetof(struct user_regs_struct, flags): return set_flags(child, value); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_gsbase_write_task(child, value); return 0; #endif } *pt_regs_access(task_pt_regs(child), offset) = value; return 0; } static unsigned long getreg(struct task_struct *task, unsigned long offset) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return get_segment_reg(task, offset); case offsetof(struct user_regs_struct, flags): return get_flags(task); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): return x86_fsbase_read_task(task); case offsetof(struct user_regs_struct, gs_base): return x86_gsbase_read_task(task); #endif } return *pt_regs_access(task_pt_regs(task), offset); } static int genregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } static int genregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const unsigned long *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) break; ret = putreg(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); /* * Store in the virtual DR6 register the fact that the breakpoint * was hit so the thread's debugger will see it. */ for (i = 0; i < HBP_NUM; i++) { if (thread->ptrace_bps[i] == bp) break; } thread->virtual_dr6 |= (DR_TRAP0 << i); } /* * Walk through every ptrace breakpoints for this thread and * build the dr7 value on top of their attributes. * */ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { int i; int dr7 = 0; struct arch_hw_breakpoint *info; for (i = 0; i < HBP_NUM; i++) { if (bp[i] && !bp[i]->attr.disabled) { info = counter_arch_bp(bp[i]); dr7 |= encode_dr7(i, info->len, info->type); } } return dr7; } static int ptrace_fill_bp_fields(struct perf_event_attr *attr, int len, int type, bool disabled) { int err, bp_len, bp_type; err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); if (!err) { attr->bp_len = bp_len; attr->bp_type = bp_type; attr->disabled = disabled; } return err; } static struct perf_event * ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, unsigned long addr, bool disabled) { struct perf_event_attr attr; int err; ptrace_breakpoint_init(&attr); attr.bp_addr = addr; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return ERR_PTR(err); return register_user_hw_breakpoint(&attr, ptrace_triggered, NULL, tsk); } static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, int disabled) { struct perf_event_attr attr = bp->attr; int err; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return err; return modify_user_hw_breakpoint(bp, &attr); } /* * Handle ptrace writes to debug register 7. */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; bool second_pass = false; int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: rc = 0; for (i = 0; i < HBP_NUM; i++) { unsigned len, type; bool disabled = !decode_dr7(data, i, &len, &type); struct perf_event *bp = thread->ptrace_bps[i]; if (!bp) { if (disabled) continue; bp = ptrace_register_breakpoint(tsk, len, type, 0, disabled); if (IS_ERR(bp)) { rc = PTR_ERR(bp); break; } thread->ptrace_bps[i] = bp; continue; } rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } /* Restore if the first pass failed, second_pass shouldn't fail. */ if (rc && !WARN_ON(second_pass)) { ret = rc; data = old_dr7; second_pass = true; goto restore; } return ret; } /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { int index = array_index_nospec(n, HBP_NUM); struct perf_event *bp = thread->ptrace_bps[index]; if (bp) val = bp->hw.info.address; } else if (n == 6) { val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ } else if (n == 7) { val = thread->ptrace_dr7; } return val; } static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { struct thread_struct *t = &tsk->thread; struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; if (!bp) { /* * Put stub len and type to create an inactive but correct bp. * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. * -EINVAL may be what we want for in-kernel breakpoints users, * but -EIO looks better for ptrace, since we refuse a register * writing for the user. And anyway this is the previous * behaviour. */ bp = ptrace_register_breakpoint(tsk, X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, addr, true); if (IS_ERR(bp)) err = PTR_ERR(bp); else t->ptrace_bps[nr] = bp; } else { struct perf_event_attr attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } return err; } /* * Handle PTRACE_POKEUSR calls for the debug register area. */ static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ int rc = -EIO; if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); } else if (n == 6) { thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ rc = 0; } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } return rc; } /* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { struct io_bitmap *iobm = target->thread.io_bitmap; return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct io_bitmap *iobm = target->thread.io_bitmap; if (!iobm) return -ENXIO; return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); } /* * Called by kernel/ptrace.c when detaching.. * * Make sure the single step bit is not set. */ void ptrace_disable(struct task_struct *child) { user_disable_single_step(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif #ifdef CONFIG_X86_64 static const struct user_regset_view user_x86_64_view; /* Initialized below. */ #endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; #ifdef CONFIG_X86_64 /* This is native 64-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_64_view; #else /* This is native 32-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_32_view; #endif switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { unsigned long tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, datap); break; } case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); #ifdef CONFIG_X86_32 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, (struct user_desc __user *)data, 0); break; #endif #ifdef CONFIG_X86_64 /* normal 64bit interface to access TLS data. Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: ret = do_arch_prctl_64(child, data, addr); break; #endif default: ret = ptrace_request(child, request, addr, data); break; } return ret; } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> #include <linux/syscalls.h> #include <asm/ia32.h> #include <asm/user32.h> #define R32(l,q) \ case offsetof(struct user32, regs.l): \ regs->q = value; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ return set_segment_reg(child, \ offsetof(struct user_regs_struct, rs), \ value); \ break static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); /* * A 32-bit ptracer on a 64-bit kernel expects that writing * FS or GS will also update the base. This is needed for * operations like PTRACE_SETREGS to fully restore a saved * CPU state. */ case offsetof(struct user32, regs.fs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, fs), value); if (ret == 0) child->thread.fsbase = x86_fsgsbase_read_task(child, value); return ret; case offsetof(struct user32, regs.gs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, gs), value); if (ret == 0) child->thread.gsbase = x86_fsgsbase_read_task(child, value); return ret; SEG32(ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.orig_eax): /* * Warning: bizarre corner case fixup here. A 32-bit * debugger setting orig_eax to -1 wants to disable * syscall restart. Make sure that the syscall * restart code sign-extends orig_ax. Also make sure * we interpret the -ERESTART* codes correctly if * loaded into regs->ax in case the task is not * actually still sitting at the exit from a 32-bit * syscall with TS_COMPAT still set. */ regs->orig_ax = value; if (syscall_get_nr(child, regs) != -1) child->thread_info.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): return set_flags(child, value); case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); return ptrace_set_debugreg(child, regno / 4, value); default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ break; } return 0; } #undef R32 #undef SEG32 #define R32(l,q) \ case offsetof(struct user32, regs.l): \ *val = regs->q; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ *val = get_segment_reg(child, \ offsetof(struct user_regs_struct, rs)); \ break static int getreg32(struct task_struct *child, unsigned regno, u32 *val) { struct pt_regs *regs = task_pt_regs(child); switch (regno) { SEG32(ds); SEG32(es); SEG32(fs); SEG32(gs); R32(cs, cs); R32(ss, ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(orig_eax, orig_ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.eflags): *val = get_flags(child); break; case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); *val = ptrace_get_debugreg(child, regno / 4); break; default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ *val = 0; break; } return 0; } #undef R32 #undef SEG32 static int genregs32_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) { u32 val; getreg32(target, reg * 4, &val); membuf_store(&to, val); } return 0; } static int genregs32_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) break; ret = putreg32(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; __u32 val; switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); if (ret == 0) ret = put_user(val, (__u32 __user *)datap); break; case PTRACE_POKEUSR: ret = putreg32(child, addr, data); break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user( child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: return arch_ptrace(child, request, addr, data); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif /* CONFIG_IA32_EMULATION */ #ifdef CONFIG_X86_X32_ABI static long x32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; switch (request) { /* Read 32bits at location addr in the USER area. Only allow to return the lower 32bits of segment and debug registers. */ case PTRACE_PEEKUSR: { u32 tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, (__u32 __user *)datap); break; } /* Write the word at location addr in the USER area. Only allow to update segment and debug registers with the upper 32bits zero-extended. */ case PTRACE_POKEUSR: ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif #ifdef CONFIG_COMPAT long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION return ia32_arch_ptrace(child, request, caddr, cdata); #else return 0; #endif } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_X86_64 static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET64_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .regset_get = genregs_get, .set = genregs_set }, [REGSET64_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET64_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET64_IOPERM] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), .active = ioperm_active, .regset_get = ioperm_get }, #ifdef CONFIG_X86_USER_SHADOW_STACK [REGSET64_SSP] = { .core_note_type = NT_X86_SHSTK, .n = 1, .size = sizeof(u64), .align = sizeof(u64), .active = ssp_active, .regset_get = ssp_get, .set = ssp_set }, #endif }; static const struct user_regset_view user_x86_64_view = { .name = "x86_64", .e_machine = EM_X86_64, .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) }; #else /* CONFIG_X86_32 */ #define user_regs_struct32 user_regs_struct #define genregs32_get genregs_get #define genregs32_set genregs_set #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET32_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = genregs32_get, .set = genregs32_set }, [REGSET32_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET32_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET32_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET32_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET32_IOPERM] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = ioperm_active, .regset_get = ioperm_get }, }; static const struct user_regset_view user_x86_32_view = { .name = "i386", .e_machine = EM_386, .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) }; #endif /* * This represents bytes 464..511 in the memory layout exported through * the REGSET_XSTATE interface. */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); #endif xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } /* * This is used by the core dump code to decide which regset to dump. The * core dump code writes out the resulting .e_machine and the corresponding * regsets. This is suboptimal if the task is messing around with its CS.L * field, but at worst the core dump will end up missing some information. * * Unfortunately, it is also used by the broken PTRACE_GETREGSET and * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have * no way to make sure that the e_machine they use matches the caller's * expectations. The result is that the data format returned by * PTRACE_GETREGSET depends on the returned CS field (and even the offset * of the returned CS field depends on its value!) and the data format * accepted by PTRACE_SETREGSET is determined by the old CS value. The * upshot is that it is basically impossible to use these APIs correctly. * * The best way to fix it in the long run would probably be to add new * improved ptrace() APIs to read and write registers reliably, possibly by * allowing userspace to select the ELF e_machine variant that they expect. */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION if (!user_64bit_mode(task_pt_regs(task))) #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION return &user_x86_32_view; #endif #ifdef CONFIG_X86_64 return &user_x86_64_view; #endif } void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; /* Send us the fake SIGTRAP */ force_sig_fault(SIGTRAP, si_code, user_mode(regs) ? (void __user *)regs->ip : NULL); } void user_single_step_report(struct pt_regs *regs) { send_sigtrap(regs, 0, TRAP_BRKPT); }
8 4 4 8 8 8 1 8 4 8 8 257 253 228 170 169 258 179 176 156 156 160 82 179 162 179 126 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/types.h> #include "ntfs_fs.h" #define BITS_IN_SIZE_T (sizeof(size_t) * 8) /* * fill_mask[i] - first i bits are '1' , i = 0,1,2,3,4,5,6,7,8 * fill_mask[i] = 0xFF >> (8-i) */ static const u8 fill_mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF }; /* * zero_mask[i] - first i bits are '0' , i = 0,1,2,3,4,5,6,7,8 * zero_mask[i] = 0xFF << i */ static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80, 0x00 }; /* * are_bits_clear * * Return: True if all bits [bit, bit+nbits) are zeros "0". */ bool are_bits_clear(const void *lmap, size_t bit, size_t nbits) { size_t pos = bit & 7; const u8 *map = (u8 *)lmap + (bit >> 3); if (pos) { if (8 - pos >= nbits) return !nbits || !(*map & fill_mask[pos + nbits] & zero_mask[pos]); if (*map++ & zero_mask[pos]) return false; nbits -= 8 - pos; } pos = ((size_t)map) & (sizeof(size_t) - 1); if (pos) { pos = sizeof(size_t) - pos; if (nbits >= pos * 8) { for (nbits -= pos * 8; pos; pos--, map++) { if (*map) return false; } } } for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { if (*((size_t *)map)) return false; } for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { if (*map) return false; } pos = nbits & 7; if (pos && (*map & fill_mask[pos])) return false; return true; } /* * are_bits_set * * Return: True if all bits [bit, bit+nbits) are ones "1". */ bool are_bits_set(const void *lmap, size_t bit, size_t nbits) { u8 mask; size_t pos = bit & 7; const u8 *map = (u8 *)lmap + (bit >> 3); if (pos) { if (8 - pos >= nbits) { mask = fill_mask[pos + nbits] & zero_mask[pos]; return !nbits || (*map & mask) == mask; } mask = zero_mask[pos]; if ((*map++ & mask) != mask) return false; nbits -= 8 - pos; } pos = ((size_t)map) & (sizeof(size_t) - 1); if (pos) { pos = sizeof(size_t) - pos; if (nbits >= pos * 8) { for (nbits -= pos * 8; pos; pos--, map++) { if (*map != 0xFF) return false; } } } for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { if (*((size_t *)map) != MINUS_ONE_T) return false; } for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { if (*map != 0xFF) return false; } pos = nbits & 7; if (pos) { mask = fill_mask[pos]; if ((*map & mask) != mask) return false; } return true; }
51 78 26 78 58 78 75 75 78 55 90 53 4 72 75 16 12 28 22 28 14 3 83 26 23 26 13 20 14 18 24 18 18 13 7 6 1 6 9 3 5 3 3 3 6 5 2 4 122 101 110 84 83 82 76 58 57 1 1 57 79 80 79 75 79 74 71 69 1 69 69 68 1 69 51 2 9 2 2 2 2 2 2 16 2 16 2 15 2 2 14 16 2 2 10 10 10 10 10 8 1 1 10 10 1 7 6 4 3 3 3 3 3 3 3 3 4 7 26 27 26 3 22 23 8 8 6 6 23 9 9 9 6 9 1 8 7 5 5 5 5 5 5 1 3 8 4 2 4 1 2 2 1 2 2 2 6 3 6 1 6 5 6 5 1 4 2 5 1 5 5 3 2 1 3 3 1 2 1 5 2 2 2 2 1 2 1 1 4 5 5 5 4 5 3 1 2 3 3 2 1 3 2 2 1 36 24 14 13 36 28 27 4 24 3 31 30 30 1 29 24 23 22 20 24 24 2 1 23 7 6 26 15 5 3 2 2 1 1 1 1 1 1 1 1 1 3 5 193 191 189 188 187 194 2 193 7 195 192 4 4 4 4 4 1 13 13 157 157 156 154 148 13 13 12 11 6 5 3 1 147 4 145 5 2 1 1 9 6 7 3 4 4 7 9 9 4 4 5 5 2 2 8 26 26 13 26 12 2 4 2 2 2 1 2 2 4 4 8 7 4 2 20 3 20 3 20 9 20 5 4 20 7 3 16 4 25 25 25 3 22 2 22 18 20 26 6 6 6 6 22 10 22 19 22 21 10 10 21 3 3 1 20 6 1 1 1 20 9 5 1 20 20 20 5 20 4 1 20 9 2 20 9 8 7 2 6 3 2 3 2 6 3 5 2 4 5 1 6 6 11 7 11 19 19 3 19 18 17 2 1 1 1 17 16 17 7 2 7 1 18 17 2 1 14 15 80 71 4 2 3 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 /* FUSE: Filesystem in Userspace Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fs_context.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/iversion.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/types.h> #include <linux/kernel.h> static bool __read_mostly allow_sys_admin_access; module_param(allow_sys_admin_access, bool, 0644); MODULE_PARM_DESC(allow_sys_admin_access, "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } #if BITS_PER_LONG >= 64 static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) { entry->d_fsdata = (void *) time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { return (u64)entry->d_fsdata; } #else union fuse_dentry { u64 time; struct rcu_head rcu; }; static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { ((union fuse_dentry *) dentry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } #endif static void fuse_dentry_settime(struct dentry *dentry, u64 time) { struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); bool delete = !time && fc->delete_stale; /* * Mess with DCACHE_OP_DELETE because dput() will be faster without it. * Don't care about races, either way it's just an optimization */ if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) || (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) { spin_lock(&dentry->d_lock); if (!delete) dentry->d_flags &= ~DCACHE_OP_DELETE; else dentry->d_flags |= DCACHE_OP_DELETE; spin_unlock(&dentry->d_lock); } __fuse_dentry_settime(dentry, time); } /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in * dentry->d_fsdata and fuse_inode->i_time respectively. */ /* * Calculate the time in jiffies until a dentry/attributes are valid */ u64 fuse_time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { struct timespec64 ts = { sec, min_t(u32, nsec, NSEC_PER_SEC - 1) }; return get_jiffies_64() + timespec64_to_jiffies(&ts); } else return 0; } /* * Set dentry and possibly attribute timeouts from the lookup/mk* * replies */ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, fuse_time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) { set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); } /* * Mark the attributes as stale, so that at the next call to * ->getattr() they will be fetched from userspace */ void fuse_invalidate_attr(struct inode *inode) { fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS); } static void fuse_dir_changed(struct inode *dir) { fuse_invalidate_attr(dir); inode_maybe_inc_iversion(dir, false); } /* * Mark the attributes as stale due to an atime change. Avoid the invalidate if * atime is not used. */ void fuse_invalidate_atime(struct inode *inode) { if (!IS_RDONLY(inode)) fuse_invalidate_attr_mask(inode, STATX_ATIME); } /* * Just mark the entry as stale, so that a next attempt to look it up * will result in a new lookup call to userspace * * This is called when a dentry is about to become negative and the * timeout is unknown (unlink, rmdir, rename and in some cases * lookup) */ void fuse_invalidate_entry_cache(struct dentry *entry) { fuse_dentry_settime(entry, 0); } /* * Same as fuse_invalidate_entry_cache(), but also try to remove the * dentry from the hash */ static void fuse_invalidate_entry(struct dentry *entry) { d_invalidate(entry); fuse_invalidate_entry_cache(entry); } static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; args->in_numargs = 1; args->in_args[0].size = name->len + 1; args->in_args[0].value = name->name; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; } /* * Check whether the dentry is still valid * * If the entry validity timeout has expired and the dentry is * positive, try to redo the lookup. If the lookup results in a * different inode, then let the VFS invalidate the dentry and redo * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; struct dentry *parent; struct fuse_mount *fm; struct fuse_inode *fi; int ret; inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; /* For negative dentries, always do a fresh lookup */ if (!inode) goto invalid; ret = -ECHILD; if (flags & LOOKUP_RCU) goto out; fm = get_fuse_mount(inode); forget = fuse_alloc_forget(); ret = -ENOMEM; if (!forget) goto out; attr_version = fuse_get_attr_version(fm->fc); parent = dget_parent(entry); fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); ret = fuse_simple_request(fm, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; if (!ret) { fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode) || (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); goto invalid; } spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); } kfree(forget); if (ret == -ENOMEM || ret == -EINTR) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || fuse_stale_inode(inode, outarg.generation, &outarg.attr)) goto invalid; forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { fi = get_fuse_inode(inode); if (flags & LOOKUP_RCU) { if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { parent = dget_parent(entry); fuse_advise_use_readdirplus(d_inode(parent)); dput(parent); } } ret = 1; out: return ret; invalid: ret = 0; goto out; } #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } static void fuse_dentry_release(struct dentry *dentry) { union fuse_dentry *fd = dentry->d_fsdata; kfree_rcu(fd, rcu); } #endif static int fuse_dentry_delete(const struct dentry *dentry) { return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); } /* * Create a fuse_mount object with a new superblock (with path->dentry * as the root), and return that mount so it can be auto-mounted on * @path. */ static struct vfsmount *fuse_dentry_automount(struct path *path) { struct fs_context *fsc; struct vfsmount *mnt; struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); if (IS_ERR(fsc)) return ERR_CAST(fsc); /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ fsc->fs_private = mp_fi; /* Create the submount */ mnt = fc_mount(fsc); if (!IS_ERR(mnt)) mntget(mnt); put_fs_context(fsc); return mnt; } const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, #if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif .d_automount = fuse_dentry_automount, }; const struct dentry_operations fuse_root_dentry_operations = { #if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif }; int fuse_valid_type(int m) { return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } static bool fuse_valid_size(u64 size) { return size <= LLONG_MAX; } bool fuse_invalid_attr(struct fuse_attr *attr) { return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size); } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version, evict_ctr; int err; *inode = NULL; err = -ENAMETOOLONG; if (name->len > FUSE_NAME_MAX) goto out; forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) goto out; attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); err = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; err = -EIO; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) { pr_warn_once("root generation should be zero\n"); outarg->generation = 0; } *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), attr_version, evict_ctr); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } err = 0; out_put_forget: kfree(forget); out: return err; } static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { int err; struct fuse_entry_out outarg; struct inode *inode; struct dentry *newent; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); fuse_unlock_inode(dir, locked); if (err == -ENOENT) { outarg_valid = false; err = 0; } if (err) goto out_err; err = -EIO; if (inode && get_node_id(inode) == FUSE_ROOT_ID) goto out_iput; newent = d_splice_alias(inode, entry); err = PTR_ERR(newent); if (IS_ERR(newent)) goto out_err; entry = newent ? newent : entry; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else fuse_invalidate_entry_cache(entry); if (inode) fuse_advise_use_readdirplus(dir); return newent; out_iput: iput(inode); out_err: return ERR_PTR(err); } static int get_security_context(struct dentry *entry, umode_t mode, struct fuse_in_arg *ext) { struct fuse_secctx *fctx; struct fuse_secctx_header *header; void *ctx = NULL, *ptr; u32 ctxlen, total_len = sizeof(*header); int err, nr_ctx = 0; const char *name; size_t namelen; err = security_dentry_init_security(entry, mode, &entry->d_name, &name, &ctx, &ctxlen); if (err) { if (err != -EOPNOTSUPP) goto out_err; /* No LSM is supporting this security hook. Ignore error */ ctxlen = 0; ctx = NULL; } if (ctxlen) { nr_ctx = 1; namelen = strlen(name) + 1; err = -EIO; if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) goto out_err; total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); } err = -ENOMEM; header = ptr = kzalloc(total_len, GFP_KERNEL); if (!ptr) goto out_err; header->nr_secctx = nr_ctx; header->size = total_len; ptr += sizeof(*header); if (nr_ctx) { fctx = ptr; fctx->size = ctxlen; ptr += sizeof(*fctx); strcpy(ptr, name); ptr += namelen; memcpy(ptr, ctx, ctxlen); } ext->size = total_len; ext->value = header; err = 0; out_err: kfree(ctx); return err; } static void *extend_arg(struct fuse_in_arg *buf, u32 bytes) { void *p; u32 newlen = buf->size + bytes; p = krealloc(buf->value, newlen, GFP_KERNEL); if (!p) { kfree(buf->value); buf->size = 0; buf->value = NULL; return NULL; } memset(p + buf->size, 0, bytes); buf->value = p; buf->size = newlen; return p + newlen - bytes; } static u32 fuse_ext_size(size_t size) { return FUSE_REC_ALIGN(sizeof(struct fuse_ext_header) + size); } /* * This adds just a single supplementary group that matches the parent's group. */ static int get_create_supp_group(struct mnt_idmap *idmap, struct inode *dir, struct fuse_in_arg *ext) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_ext_header *xh; struct fuse_supp_groups *sg; kgid_t kgid = dir->i_gid; vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, kgid); gid_t parent_gid = from_kgid(fc->user_ns, kgid); u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0])); if (parent_gid == (gid_t) -1 || vfsgid_eq_kgid(vfsgid, current_fsgid()) || !vfsgid_in_group_p(vfsgid)) return 0; xh = extend_arg(ext, sg_len); if (!xh) return -ENOMEM; xh->size = sg_len; xh->type = FUSE_EXT_GROUPS; sg = (struct fuse_supp_groups *) &xh[1]; sg->nr_groups = 1; sg->groups[0] = parent_gid; return 0; } static int get_create_ext(struct mnt_idmap *idmap, struct fuse_args *args, struct inode *dir, struct dentry *dentry, umode_t mode) { struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); struct fuse_in_arg ext = { .size = 0, .value = NULL }; int err = 0; if (fc->init_security) err = get_security_context(dentry, mode, &ext); if (!err && fc->create_supp_group) err = get_create_supp_group(idmap, dir, &ext); if (!err && ext.size) { WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); args->is_ext = true; args->ext_idx = args->in_numargs++; args->in_args[args->ext_idx] = ext; } else { kfree(ext.value); } return err; } static void free_ext_value(struct fuse_args *args) { if (args->is_ext) kfree(args->in_args[args->ext_idx].value); } /* * Atomic create+open operation * * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; struct fuse_open_out *outopenp; struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) goto out_err; err = -ENOMEM; ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; if (!fm->fc->dont_mask) mode &= ~current_umask(); flags &= ~O_NOCTTY; memset(&inarg, 0, sizeof(inarg)); memset(&outentry, 0, sizeof(outentry)); inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); if (fm->fc->handle_killpriv_v2 && trunc && !(flags & O_EXCL) && !capable(CAP_FSETID)) { inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; args.out_numargs = 2; args.out_args[0].size = sizeof(outentry); args.out_args[0].value = &outentry; /* Store outarg for fuse_finish_open() */ outopenp = &ff->args->open_outarg; args.out_args[1].size = sizeof(*outopenp); args.out_args[1].value = outopenp; err = get_create_ext(idmap, &args, dir, entry, mode); if (err) goto out_free_ff; err = fuse_simple_idmap_request(idmap, fm, &args); free_ext_value(&args); if (err) goto out_free_ff; err = -EIO; if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr)) goto out_free_ff; ff->fh = outopenp->fh; ff->nodeid = outentry.nodeid; ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; } kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); if (!err) { file->private_data = ff; err = finish_open(file, entry, fuse_finish_open); } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); } return err; out_free_ff: fuse_file_free(ff); out_put_forget_req: kfree(forget); out_err: return err; } static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) { int err; struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; if (fuse_is_bad(dir)) return -EIO; if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) return PTR_ERR(res); if (res) entry = res; } if (!(flags & O_CREAT) || d_really_is_positive(entry)) goto no_open; /* Only creates */ file->f_mode |= FMODE_CREATED; if (fc->no_create) goto mknod; err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; } else if (err == -EEXIST) fuse_invalidate_entry(entry); out_dput: dput(res); return err; mknod: err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: return finish_no_open(file, res); } /* * Code shared between mknod, mkdir, symlink and link */ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; int err; struct fuse_forget_link *forget; if (fuse_is_bad(dir)) return -EIO; forget = fuse_alloc_forget(); if (!forget) return -ENOMEM; memset(&outarg, 0, sizeof(outarg)); args->nodeid = get_node_id(dir); args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { err = get_create_ext(idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } err = fuse_simple_idmap_request(idmap, fm, args); free_ext_value(args); if (err) goto out_put_forget_req; err = -EIO; if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr)) goto out_put_forget_req; if ((outarg.attr.mode ^ mode) & S_IFMT) goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) return PTR_ERR(d); if (d) { fuse_change_entry_timeout(d, &outarg); dput(d); } else { fuse_change_entry_timeout(entry, &outarg); } fuse_dir_changed(dir); return 0; out_put_forget_req: if (err == -EEXIST) fuse_invalidate_entry(entry); kfree(forget); return err; } static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); args.opcode = FUSE_MKNOD; args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; return create_new_entry(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { return fuse_mknod(idmap, dir, entry, mode, 0); } static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct fuse_conn *fc = get_fuse_conn(dir); int err; if (fc->no_tmpfile) return -EOPNOTSUPP; err = fuse_create_open(idmap, dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); if (err == -ENOSYS) { fc->no_tmpfile = 1; err = -EOPNOTSUPP; } return err; } static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); args.opcode = FUSE_MKDIR; args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, const char *link) { struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 1; FUSE_ARGS(args); args.opcode = FUSE_SYMLINK; args.in_numargs = 2; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) { int err = sync_inode_metadata(inode, 1); mapping_set_error(inode->i_mapping, err); } static void fuse_update_ctime_in_cache(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode_set_ctime_current(inode); mark_inode_dirty_sync(inode); fuse_flush_time_update(inode); } } void fuse_update_ctime(struct inode *inode) { fuse_invalidate_attr_mask(inode, STATX_CTIME); fuse_update_ctime_in_cache(inode); } static void fuse_entry_unlinked(struct dentry *entry) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be * difficult to enforce correct nlink usage so just ignore this * condition here */ if (S_ISDIR(inode->i_mode)) clear_nlink(inode); else if (inode->i_nlink > 0) drop_nlink(inode); spin_unlock(&fi->lock); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); } static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (fuse_is_bad(dir)) return -EIO; args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (fuse_is_bad(dir)) return -EIO; args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags, int opcode, size_t argsize) { int err; struct fuse_rename2_in inarg; struct fuse_mount *fm = get_fuse_mount(olddir); FUSE_ARGS(args); memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; args.opcode = opcode; args.nodeid = get_node_id(olddir); args.in_numargs = 3; args.in_args[0].size = argsize; args.in_args[0].value = &inarg; args.in_args[1].size = oldent->d_name.len + 1; args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; err = fuse_simple_idmap_request(idmap, fm, &args); if (!err) { /* ctime changes */ fuse_update_ctime(d_inode(oldent)); if (flags & RENAME_EXCHANGE) fuse_update_ctime(d_inode(newent)); fuse_dir_changed(olddir); if (olddir != newdir) fuse_dir_changed(newdir); /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) fuse_entry_unlinked(newent); } else if (err == -EINTR || err == -ENOENT) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation fails (e.g. some process has CWD under the renamed directory), then there can be inconsistency between the dcache and the real filesystem. Tough luck. */ fuse_invalidate_entry(oldent); if (d_really_is_positive(newent)) fuse_invalidate_entry(newent); } return err; } static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(olddir); int err; if (fuse_is_bad(olddir)) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags) { if (fc->no_rename2 || fc->minor < 23) return -EINVAL; err = fuse_rename_common((flags & RENAME_WHITEOUT) ? idmap : &invalid_mnt_idmap, olddir, oldent, newdir, newent, flags, FUSE_RENAME2, sizeof(struct fuse_rename2_in)); if (err == -ENOSYS) { fc->no_rename2 = 1; err = -EINVAL; } } else { err = fuse_rename_common(&invalid_mnt_idmap, olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); } return err; } static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { int err; struct fuse_link_in inarg; struct inode *inode = d_inode(entry); struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) fuse_invalidate_attr(inode); return err; } static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns, make_kuid(fc->user_ns, attr->uid)); vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, make_kgid(fc->user_ns, attr->gid)); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; stat->mtime.tv_sec = attr->mtime; stat->mtime.tv_nsec = attr->mtimensec; stat->ctime.tv_sec = attr->ctime; stat->ctime.tv_nsec = attr->ctimensec; stat->size = attr->size; stat->blocks = attr->blocks; if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else blkbits = inode->i_sb->s_blocksize_bits; stat->blksize = 1 << blkbits; } static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) { memset(attr, 0, sizeof(*attr)); attr->ino = sx->ino; attr->size = sx->size; attr->blocks = sx->blocks; attr->atime = sx->atime.tv_sec; attr->mtime = sx->mtime.tv_sec; attr->ctime = sx->ctime.tv_sec; attr->atimensec = sx->atime.tv_nsec; attr->mtimensec = sx->mtime.tv_nsec; attr->ctimensec = sx->ctime.tv_nsec; attr->mode = sx->mode; attr->nlink = sx->nlink; attr->uid = sx->uid; attr->gid = sx->gid; attr->rdev = new_encode_dev(MKDEV(sx->rdev_major, sx->rdev_minor)); attr->blksize = sx->blksize; } static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode, struct file *file, struct kstat *stat) { int err; struct fuse_attr attr; struct fuse_statx *sx; struct fuse_statx_in inarg; struct fuse_statx_out outarg; struct fuse_mount *fm = get_fuse_mount(inode); u64 attr_version = fuse_get_attr_version(fm->fc); FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); /* Directories have separate file-handle space */ if (file && S_ISREG(inode->i_mode)) { struct fuse_file *ff = file->private_data; inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } /* For now leave sync hints as the default, request all stats. */ inarg.sx_flags = 0; inarg.sx_mask = STATX_BASIC_STATS | STATX_BTIME; args.opcode = FUSE_STATX; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (err) return err; sx = &outarg.stat; if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || inode_wrong_type(inode, sx->mode)))) { fuse_make_bad(inode); return -EIO; } fuse_statx_to_attr(&outarg.stat, &attr); if ((sx->mask & STATX_BASIC_STATS) == STATX_BASIC_STATS) { fuse_change_attributes(inode, &attr, &outarg.stat, ATTR_TIMEOUT(&outarg), attr_version); } if (stat) { stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); stat->btime.tv_sec = sx->btime.tv_sec; stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); fuse_fillattr(idmap, inode, &attr, stat); stat->result_mask |= STATX_TYPE; } return 0; } static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, struct kstat *stat, struct file *file) { int err; struct fuse_getattr_in inarg; struct fuse_attr_out outarg; struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); u64 attr_version; attr_version = fuse_get_attr_version(fm->fc); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); /* Directories have separate file-handle space */ if (file && S_ISREG(inode->i_mode)) { struct fuse_file *ff = file->private_data; inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } args.opcode = FUSE_GETATTR; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; } else { fuse_change_attributes(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), attr_version); if (stat) fuse_fillattr(idmap, inode, &outarg.attr, stat); } } return err; } static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, struct file *file, struct kstat *stat, u32 request_mask, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); int err = 0; bool sync; u32 inval_mask = READ_ONCE(fi->inval_mask); u32 cache_mask = fuse_get_cache_mask(inode); /* FUSE only supports basic stats and possibly btime */ request_mask &= STATX_BASIC_STATS | STATX_BTIME; retry: if (fc->no_statx) request_mask &= STATX_BASIC_STATS; if (!request_mask) sync = false; else if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; else if (request_mask & inval_mask & ~cache_mask) sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); if (sync) { forget_all_cached_acls(inode); /* Try statx if BTIME is requested */ if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { err = fuse_do_statx(idmap, inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; err = 0; goto retry; } } else { err = fuse_do_getattr(idmap, inode, stat, file); } } else if (stat) { generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; if (test_bit(FUSE_I_BTIME, &fi->state)) { stat->btime = fi->i_btime; stat->result_mask |= STATX_BTIME; } } return err; } int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; struct dentry *dir; struct dentry *entry; parent = fuse_ilookup(fc, parent_nodeid, NULL); if (!parent) return -ENOENT; inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) goto unlock; err = -ENOENT; dir = d_find_alias(parent); if (!dir) goto unlock; name->hash = full_name_hash(dir, name->name, name->len); entry = d_lookup(dir, name); dput(dir); if (!entry) goto unlock; fuse_dir_changed(parent); if (!(flags & FUSE_EXPIRE_ONLY)) d_invalidate(entry); fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; } if (d_mountpoint(entry)) { err = -EBUSY; goto badentry; } if (d_is_dir(entry)) { shrink_dcache_parent(entry); if (!simple_empty(entry)) { err = -ENOTEMPTY; goto badentry; } d_inode(entry)->i_flags |= S_DEAD; } dont_mount(entry); clear_nlink(d_inode(entry)); err = 0; badentry: inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { err = 0; } dput(entry); unlock: inode_unlock(parent); iput(parent); return err; } static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) { const struct cred *cred = current_cred(); return (uid_eq(cred->euid, fc->user_id) && uid_eq(cred->suid, fc->user_id) && uid_eq(cred->uid, fc->user_id) && gid_eq(cred->egid, fc->group_id) && gid_eq(cred->sgid, fc->group_id) && gid_eq(cred->gid, fc->group_id)); } /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This * means, that the filesystem daemon is able to record the exact * filesystem operations performed, and can also control the behavior * of the requester process in otherwise impossible ways. For example * it can delay the operation for arbitrary length of time allowing * DoS against the requester. * * For this reason only those processes can call into the filesystem, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ bool fuse_allow_current_process(struct fuse_conn *fc) { bool allow; if (fc->allow_other) allow = current_in_userns(fc->user_ns); else allow = fuse_permissible_uidgid(fc); if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) allow = true; return allow; } static int fuse_access(struct inode *inode, int mask) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_access_in inarg; int err; BUG_ON(mask & MAY_NOT_BLOCK); /* * We should not send FUSE_ACCESS to the userspace * when idmapped mounts are enabled as for this case * we have fc->default_permissions = 1 and access * permission checks are done on the kernel side. */ WARN_ON_ONCE(!(fm->sb->s_iflags & SB_I_NOIDMAP)); if (fm->fc->no_access) return 0; memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); args.opcode = FUSE_ACCESS; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_access = 1; err = 0; } return err; } static int fuse_perm_getattr(struct inode *inode, int mask) { if (mask & MAY_NOT_BLOCK) return -ECHILD; forget_all_cached_acls(inode); return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL); } /* * Check permission. The two basic access models of FUSE are: * * 1) Local access checking ('default_permissions' mount option) based * on file mode. This is the plain old disk filesystem permission * model. * * 2) "Remote" access checking, where server is responsible for * checking permission in each inode operation. An exception to this * is if ->permission() was invoked from sys_access() in which case an * access request is sent. Execute permission is still checked * locally based on file mode. */ static int fuse_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(fc)) return -EACCES; /* * If attributes are needed, refresh them before proceeding */ if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID; if (perm_mask & READ_ONCE(fi->inval_mask) || time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); if (err) return err; } } if (fc->default_permissions) { err = generic_permission(idmap, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root node will at first have no permissions */ if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) err = generic_permission(idmap, inode, mask); } /* Note: the opposite of the above test does not exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { if (refreshed) return -EACCES; err = fuse_perm_getattr(inode, mask); if (!err && !(inode->i_mode & S_IXUGO)) return -EACCES; } } return err; } static int fuse_readlink_page(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { .num_folios = 1, .folios = &folio, .descs = &desc, }; char *link; ssize_t res; ap.args.opcode = FUSE_READLINK; ap.args.nodeid = get_node_id(inode); ap.args.out_pages = true; ap.args.out_argvar = true; ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; res = fuse_simple_request(fm, &ap.args); fuse_invalidate_atime(inode); if (res < 0) return res; if (WARN_ON(res >= PAGE_SIZE)) return -EIO; link = folio_address(folio); link[res] = '\0'; return 0; } static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); struct folio *folio; int err; err = -EIO; if (fuse_is_bad(inode)) goto out_err; if (fc->cache_symlinks) return page_get_link(dentry, inode, callback); err = -ECHILD; if (!dentry) goto out_err; folio = folio_alloc(GFP_KERNEL, 0); err = -ENOMEM; if (!folio) goto out_err; err = fuse_readlink_page(inode, folio); if (err) { folio_put(folio); goto out_err; } set_delayed_call(callback, page_put_link, &folio->page); return folio_address(folio); out_err: return ERR_PTR(err); } static int fuse_dir_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); int err; if (fuse_is_bad(inode)) return -EIO; err = generic_file_open(inode, file); if (err) return err; err = fuse_do_open(fm, get_node_id(inode), file, true); if (!err) { struct fuse_file *ff = file->private_data; /* * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for * directories for backward compatibility, though it's unlikely * to be useful. */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); } return err; } static int fuse_dir_release(struct inode *inode, struct file *file) { fuse_release_common(file, true); return 0; } static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); int err; if (fuse_is_bad(inode)) return -EIO; if (fc->no_fsyncdir) return 0; inode_lock(inode); err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR); if (err == -ENOSYS) { fc->no_fsyncdir = 1; err = 0; } inode_unlock(inode); return err; } static long fuse_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ if (fc->minor < 18) return -ENOTTY; return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); } static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); if (fc->minor < 18) return -ENOTTY; return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } static bool update_mtime(unsigned ivalid, bool trust_local_mtime) { /* Always update if mtime is explicitly set */ if (ivalid & ATTR_MTIME_SET) return true; /* Or if kernel i_mtime is the official one */ if (trust_local_mtime) return true; /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) return false; /* In all other cases update */ return true; } static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc, struct iattr *iattr, struct fuse_setattr_in *arg, bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) { kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid); arg->valid |= FATTR_UID; arg->uid = from_kuid(fc->user_ns, fsuid); } if (ivalid & ATTR_GID) { kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid); arg->valid |= FATTR_GID; arg->gid = from_kgid(fc->user_ns, fsgid); } if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { arg->valid |= FATTR_ATIME; arg->atime = iattr->ia_atime.tv_sec; arg->atimensec = iattr->ia_atime.tv_nsec; if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { arg->valid |= FATTR_CTIME; arg->ctime = iattr->ia_ctime.tv_sec; arg->ctimensec = iattr->ia_ctime.tv_nsec; } } /* * Prevent concurrent writepages on inode * * This is done by adding a negative bias to the inode write counter * and waiting for all pending writes to finish. */ void fuse_set_nowrite(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!inode_is_locked(inode)); spin_lock(&fi->lock); BUG_ON(fi->writectr < 0); fi->writectr += FUSE_NOWRITE; spin_unlock(&fi->lock); wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); } /* * Allow writepages on inode * * Remove the bias from the writecounter and send any queued * writepages. */ static void __fuse_release_nowrite(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(fi->writectr != FUSE_NOWRITE); fi->writectr = 0; fuse_flush_writepages(inode); } void fuse_release_nowrite(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); __fuse_release_nowrite(inode); spin_unlock(&fi->lock); } static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct inode *inode, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { args->opcode = FUSE_SETATTR; args->nodeid = get_node_id(inode); args->in_numargs = 1; args->in_args[0].size = sizeof(*inarg_p); args->in_args[0].value = inarg_p; args->out_numargs = 1; args->out_args[0].size = sizeof(*outarg_p); args->out_args[0].value = outarg_p; } /* * Flush inode->i_mtime to the server */ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); inarg.valid = FATTR_MTIME; inarg.mtime = inode_get_mtime_sec(inode); inarg.mtimensec = inode_get_mtime_nsec(inode); if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; inarg.ctime = inode_get_ctime_sec(inode); inarg.ctimensec = inode_get_ctime_nsec(inode); } if (ff) { inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); return fuse_simple_request(fm, &args); } /* * Set attributes, and at the same time refresh them. * * Truncation is slightly complicated, because the 'truncate' request * may fail, in which case we don't want to touch the mapping. * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); loff_t oldsize; int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; err = setattr_prepare(idmap, dentry, attr); if (err) return err; if (attr->ia_valid & ATTR_SIZE) { if (WARN_ON(!S_ISREG(inode->i_mode))) return -EIO; is_truncate = true; } if (FUSE_IS_DAX(inode) && is_truncate) { filemap_invalidate_lock(mapping); fault_blocked = true; err = fuse_dax_break_layouts(inode, 0, 0); if (err) { filemap_invalidate_unlock(mapping); return err; } } if (attr->ia_valid & ATTR_OPEN) { /* This is coming from open(..., ... | O_TRUNC); */ WARN_ON(!(attr->ia_valid & ATTR_SIZE)); WARN_ON(attr->ia_size != 0); if (fc->atomic_o_trunc) { /* * No need to send request to userspace, since actual * truncation has already been done by OPEN. But still * need to truncate page cache. */ i_size_write(inode, 0); truncate_pagecache(inode, 0); goto out; } file = NULL; } /* Flush dirty data/metadata before non-truncate SETATTR */ if (is_wb && attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | ATTR_TIMES_SET)) { err = write_inode_now(inode, true); if (err) return err; fuse_set_nowrite(inode); fuse_release_nowrite(inode); } if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (trust_local_cmtime && attr->ia_size != inode->i_size) attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } /* Kill suid/sgid for non-directory chown unconditionally */ if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) && attr->ia_valid & (ATTR_UID | ATTR_GID)) inarg.valid |= FATTR_KILL_SUIDGID; if (attr->ia_valid & ATTR_SIZE) { /* For mandatory locking in truncate */ inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); /* Kill suid/sgid for truncate only if no CAP_FSETID */ if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); goto error; } if (fuse_invalid_attr(&outarg.attr) || inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; goto error; } spin_lock(&fi->lock); /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); if (attr->ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) i_size_write(inode, outarg.attr.size); if (is_truncate) { /* NOTE: this may release/reacquire fi->lock */ __fuse_release_nowrite(inode); } spin_unlock(&fi->lock); /* * Only call invalidate_inode_pages2() after removing * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock. */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); out: if (fault_blocked) filemap_invalidate_unlock(mapping); return 0; error: if (is_truncate) fuse_release_nowrite(inode); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (fault_blocked) filemap_invalidate_unlock(mapping); return err; } static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; int ret; if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE); /* * The only sane way to reliably kill suid/sgid is to do it in * the userspace filesystem * * This should be done on write(), truncate() and chown(). */ if (!fc->handle_killpriv && !fc->handle_killpriv_v2) { /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. */ ret = fuse_do_getattr(idmap, inode, NULL, file); if (ret) return ret; attr->ia_mode = inode->i_mode; if (inode->i_mode & S_ISUID) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISUID; } if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISGID; } } } if (!attr->ia_valid) return 0; ret = fuse_do_setattr(idmap, entry, attr, file); if (!ret) { /* * If filesystem supports acls it may have updated acl xattrs in * the filesystem, so forget cached acls for the inode. */ if (fc->posix_acl) forget_all_cached_acls(inode); /* Directory mode changed, may need to revalidate access */ if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) fuse_invalidate_entry_cache(entry); } return ret; } static int fuse_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(fc)) { if (!request_mask) { /* * If user explicitly requested *nothing* then don't * error out, but return st_dev only. */ stat->result_mask = 0; stat->dev = inode->i_sb->s_dev; return 0; } return -EACCES; } return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, .get_inode_acl = fuse_get_inode_acl, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, }; static const struct file_operations fuse_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = fuse_readdir, .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, .unlocked_ioctl = fuse_dir_ioctl, .compat_ioctl = fuse_dir_compat_ioctl, }; static const struct inode_operations fuse_common_inode_operations = { .setattr = fuse_setattr, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, .get_inode_acl = fuse_get_inode_acl, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, }; static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .get_link = fuse_get_link, .getattr = fuse_getattr, .listxattr = fuse_listxattr, }; void fuse_init_common(struct inode *inode) { inode->i_op = &fuse_common_inode_operations; } void fuse_init_dir(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); inode->i_op = &fuse_dir_inode_operations; inode->i_fop = &fuse_dir_operations; spin_lock_init(&fi->rdc.lock); fi->rdc.cached = false; fi->rdc.size = 0; fi->rdc.pos = 0; fi->rdc.version = 0; } static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { int err = fuse_readlink_page(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); folio_unlock(folio); return err; } static const struct address_space_operations fuse_symlink_aops = { .read_folio = fuse_symlink_read_folio, }; void fuse_init_symlink(struct inode *inode) { inode->i_op = &fuse_symlink_inode_operations; inode->i_data.a_ops = &fuse_symlink_aops; inode_nohighmem(inode); }
2 13 2 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 // SPDX-License-Identifier: GPL-2.0-only /* * Suspend support specific for i386/x86-64. * * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ #include <linux/suspend.h> #include <linux/export.h> #include <linux/smp.h> #include <linux/perf_event.h> #include <linux/tboot.h> #include <linux/dmi.h> #include <linux/pgtable.h> #include <asm/proto.h> #include <asm/mtrr.h> #include <asm/page.h> #include <asm/mce.h> #include <asm/suspend.h> #include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/cpu.h> #include <asm/cacheinfo.h> #include <asm/mmu_context.h> #include <asm/cpu_device_id.h> #include <asm/microcode.h> #ifdef CONFIG_X86_32 __visible unsigned long saved_context_ebx; __visible unsigned long saved_context_esp, saved_context_ebp; __visible unsigned long saved_context_esi, saved_context_edi; __visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; static void msr_save_context(struct saved_context *ctxt) { struct saved_msr *msr = ctxt->saved_msrs.array; struct saved_msr *end = msr + ctxt->saved_msrs.num; while (msr < end) { if (msr->valid) rdmsrl(msr->info.msr_no, msr->info.reg.q); msr++; } } static void msr_restore_context(struct saved_context *ctxt) { struct saved_msr *msr = ctxt->saved_msrs.array; struct saved_msr *end = msr + ctxt->saved_msrs.num; while (msr < end) { if (msr->valid) wrmsrl(msr->info.msr_no, msr->info.reg.q); msr++; } } /** * __save_processor_state() - Save CPU registers before creating a * hibernation image and before restoring * the memory state from it * @ctxt: Structure to store the registers contents in. * * NOTE: If there is a CPU register the modification of which by the * boot kernel (ie. the kernel used for loading the hibernation image) * might affect the operations of the restored target kernel (ie. the one * saved in the hibernation image), then its contents must be saved by this * function. In other words, if kernel A is hibernated and different * kernel B is used for loading the hibernation image into memory, the * kernel A's __save_processor_state() function must save all registers * needed by kernel A, so that it can operate correctly after the resume * regardless of what kernel B does in the meantime. */ static void __save_processor_state(struct saved_context *ctxt) { #ifdef CONFIG_X86_32 mtrr_save_fixed_ranges(NULL); #endif kernel_fpu_begin(); /* * descriptor tables */ store_idt(&ctxt->idt); /* * We save it here, but restore it only in the hibernate case. * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit * mode in "secondary_startup_64". In 32-bit mode it is done via * 'pmode_gdt' in wakeup_start. */ ctxt->gdt_desc.size = GDT_SIZE - 1; ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id()); store_tr(ctxt->tr); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ /* * segment registers */ savesegment(gs, ctxt->gs); #ifdef CONFIG_X86_64 savesegment(fs, ctxt->fs); savesegment(ds, ctxt->ds); savesegment(es, ctxt->es); rdmsrl(MSR_FS_BASE, ctxt->fs_base); rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); mtrr_save_fixed_ranges(NULL); rdmsrl(MSR_EFER, ctxt->efer); #endif /* * control registers */ ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = __read_cr3(); ctxt->cr4 = __read_cr4(); ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, &ctxt->misc_enable); msr_save_context(ctxt); } /* Needed by apm.c */ void save_processor_state(void) { __save_processor_state(&saved_context); x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); #endif static void do_fpu_end(void) { /* * Restore FPU regs if necessary. */ kernel_fpu_end(); } static void fix_processor_context(void) { int cpu = smp_processor_id(); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif /* * We need to reload TR, which requires that we change the * GDT entry to indicate "available" first. * * XXX: This could probably all be replaced by a call to * force_reload_TR(). */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); #ifdef CONFIG_X86_64 memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); tss.type = 0x9; /* The available 64-bit TSS (see AMD vol 2, pg 91 */ write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); syscall_init(); /* This sets MSR_*STAR and related */ #else if (boot_cpu_has(X86_FEATURE_SEP)) enable_sep_cpu(); #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ initialize_tlbstate_and_flush(); fpu__resume_cpu(); /* The processor is back on the direct GDT, load back the fixmap */ load_fixmap_gdt(cpu); } /** * __restore_processor_state() - Restore the contents of CPU registers saved * by __save_processor_state() * @ctxt: Structure to load the registers contents from. * * The asm code that gets us here will have restored a usable GDT, although * it will be pointing to the wrong alias. */ static void notrace __restore_processor_state(struct saved_context *ctxt) { struct cpuinfo_x86 *c; if (ctxt->misc_enable_saved) wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); /* * control registers */ /* cr4 was introduced in the Pentium CPU */ #ifdef CONFIG_X86_32 if (ctxt->cr4) __write_cr4(ctxt->cr4); #else /* CONFIG X86_64 */ wrmsrl(MSR_EFER, ctxt->efer); __write_cr4(ctxt->cr4); #endif write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); /* Restore the IDT. */ load_idt(&ctxt->idt); /* * Just in case the asm code got us here with the SS, DS, or ES * out of sync with the GDT, update them. */ loadsegment(ss, __KERNEL_DS); loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); /* * Restore percpu access. Percpu access can happen in exception * handlers or in complicated helpers like load_gs_index(). */ #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); #else loadsegment(fs, __KERNEL_PERCPU); #endif /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ fix_processor_context(); /* * Now that we have descriptor tables fully restored and working * exception handling, restore the usermode segments. */ #ifdef CONFIG_X86_64 loadsegment(ds, ctxt->es); loadsegment(es, ctxt->es); loadsegment(fs, ctxt->fs); load_gs_index(ctxt->gs); /* * Restore FSBASE and GSBASE after restoring the selectors, since * restoring the selectors clobbers the bases. Keep in mind * that MSR_KERNEL_GS_BASE is horribly misnamed. */ wrmsrl(MSR_FS_BASE, ctxt->fs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); #else loadsegment(gs, ctxt->gs); #endif do_fpu_end(); tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); cache_bp_restore(); perf_restore_debug_store(); c = &cpu_data(smp_processor_id()); if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL)) init_ia32_feat_ctl(c); microcode_bsp_resume(); /* * This needs to happen after the microcode has been updated upon resume * because some of the MSRs are "emulated" in microcode. */ msr_restore_context(ctxt); } /* Needed by apm.c */ void notrace restore_processor_state(void) { __restore_processor_state(&saved_context); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); #endif #if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU) static void __noreturn resume_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); hlt_play_dead(); } int hibernate_resume_nonboot_cpu_disable(void) { void (*play_dead)(void) = smp_ops.play_dead; int ret; /* * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop * during hibernate image restoration, because it is likely that the * monitored address will be actually written to at that time and then * the "dead" CPU will attempt to execute instructions again, but the * address in its instruction pointer may not be possible to resolve * any more at that point (the page tables used by it previously may * have been overwritten by hibernate image data). * * First, make sure that we wake up all the potentially disabled SMT * threads which have been initially brought up and then put into * mwait/cpuidle sleep. * Those will be put to proper (not interfering with hibernation * resume) sleep afterwards, and the resumed kernel will decide itself * what to do with them. */ ret = cpuhp_smt_enable(); if (ret) return ret; smp_ops.play_dead = resume_play_dead; ret = freeze_secondary_cpus(0); smp_ops.play_dead = play_dead; return ret; } #endif /* * When bsp_check() is called in hibernate and suspend, cpu hotplug * is disabled already. So it's unnecessary to handle race condition between * cpumask query and cpu hotplug. */ static int bsp_check(void) { if (cpumask_first(cpu_online_mask) != 0) { pr_warn("CPU0 is offline.\n"); return -ENODEV; } return 0; } static int bsp_pm_callback(struct notifier_block *nb, unsigned long action, void *ptr) { int ret = 0; switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: ret = bsp_check(); break; default: break; } return notifier_from_errno(ret); } static int __init bsp_pm_check_init(void) { /* * Set this bsp_pm_callback as lower priority than * cpu_hotplug_pm_callback. So cpu_hotplug_pm_callback will be called * earlier to disable cpu hotplug before bsp online check. */ pm_notifier(bsp_pm_callback, -INT_MAX); return 0; } core_initcall(bsp_pm_check_init); static int msr_build_context(const u32 *msr_id, const int num) { struct saved_msrs *saved_msrs = &saved_context.saved_msrs; struct saved_msr *msr_array; int total_num; int i, j; total_num = saved_msrs->num + num; msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL); if (!msr_array) { pr_err("x86/pm: Can not allocate memory to save/restore MSRs during suspend.\n"); return -ENOMEM; } if (saved_msrs->array) { /* * Multiple callbacks can invoke this function, so copy any * MSR save requests from previous invocations. */ memcpy(msr_array, saved_msrs->array, sizeof(struct saved_msr) * saved_msrs->num); kfree(saved_msrs->array); } for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) { u64 dummy; msr_array[i].info.msr_no = msr_id[j]; msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy); msr_array[i].info.reg.q = 0; } saved_msrs->num = total_num; saved_msrs->array = msr_array; return 0; } /* * The following sections are a quirk framework for problematic BIOSen: * Sometimes MSRs are modified by the BIOSen after suspended to * RAM, this might cause unexpected behavior after wakeup. * Thus we save/restore these specified MSRs across suspend/resume * in order to work around it. * * For any further problematic BIOSen/platforms, * please add your own function similar to msr_initialize_bdw. */ static int msr_initialize_bdw(const struct dmi_system_id *d) { /* Add any extra MSR ids into this array. */ u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL }; pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident); return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); } static const struct dmi_system_id msr_save_dmi_table[] = { { .callback = msr_initialize_bdw, .ident = "BROADWELL BDX_EP", .matches = { DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"), DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"), }, }, {} }; static int msr_save_cpuid_features(const struct x86_cpu_id *c) { u32 cpuid_msr_id[] = { MSR_AMD64_CPUID_FN_1, }; pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n", c->family); return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id)); } static const struct x86_cpu_id msr_save_cpu_table[] = { X86_MATCH_VENDOR_FAM(AMD, 0x15, &msr_save_cpuid_features), X86_MATCH_VENDOR_FAM(AMD, 0x16, &msr_save_cpuid_features), {} }; typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *); static int pm_cpu_check(const struct x86_cpu_id *c) { const struct x86_cpu_id *m; int ret = 0; m = x86_match_cpu(msr_save_cpu_table); if (m) { pm_cpu_match_t fn; fn = (pm_cpu_match_t)m->driver_data; ret = fn(m); } return ret; } static void pm_save_spec_msr(void) { struct msr_enumeration { u32 msr_no; u32 feature; } msr_enum[] = { { MSR_IA32_SPEC_CTRL, X86_FEATURE_MSR_SPEC_CTRL }, { MSR_IA32_TSX_CTRL, X86_FEATURE_MSR_TSX_CTRL }, { MSR_TSX_FORCE_ABORT, X86_FEATURE_TSX_FORCE_ABORT }, { MSR_IA32_MCU_OPT_CTRL, X86_FEATURE_SRBDS_CTRL }, { MSR_AMD64_LS_CFG, X86_FEATURE_LS_CFG_SSBD }, { MSR_AMD64_DE_CFG, X86_FEATURE_LFENCE_RDTSC }, }; int i; for (i = 0; i < ARRAY_SIZE(msr_enum); i++) { if (boot_cpu_has(msr_enum[i].feature)) msr_build_context(&msr_enum[i].msr_no, 1); } } static int pm_check_save_msr(void) { dmi_check_system(msr_save_dmi_table); pm_cpu_check(msr_save_cpu_table); pm_save_spec_msr(); return 0; } device_initcall(pm_check_save_msr);
4944 13116 9758 115 382 11 2116 5734 522 16511 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BIT_SPINLOCK_H #define __LINUX_BIT_SPINLOCK_H #include <linux/kernel.h> #include <linux/preempt.h> #include <linux/atomic.h> #include <linux/bug.h> /* * bit-based spin_lock() * * Don't use this unless you really need to: spin_lock() and spin_unlock() * are significantly faster. */ static inline void bit_spin_lock(int bitnum, unsigned long *addr) { /* * Assuming the lock is uncontended, this never enters * the body of the outer loop. If it is contended, then * within the inner loop a non-atomic test is used to * busywait with less bus contention for a good time to * attempt to acquire the lock bit. */ preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) while (unlikely(test_and_set_bit_lock(bitnum, addr))) { preempt_enable(); do { cpu_relax(); } while (test_bit(bitnum, addr)); preempt_disable(); } #endif __acquire(bitlock); } /* * Return true if it was acquired */ static inline int bit_spin_trylock(int bitnum, unsigned long *addr) { preempt_disable(); #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) if (unlikely(test_and_set_bit_lock(bitnum, addr))) { preempt_enable(); return 0; } #endif __acquire(bitlock); return 1; } /* * bit-based spin_unlock() */ static inline void bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) clear_bit_unlock(bitnum, addr); #endif preempt_enable(); __release(bitlock); } /* * bit-based spin_unlock() * non-atomic version, which can be used eg. if the bit lock itself is * protecting the rest of the flags in the word. */ static inline void __bit_spin_unlock(int bitnum, unsigned long *addr) { #ifdef CONFIG_DEBUG_SPINLOCK BUG_ON(!test_bit(bitnum, addr)); #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) __clear_bit_unlock(bitnum, addr); #endif preempt_enable(); __release(bitlock); } /* * Return true if the lock is held. */ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) { #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) return test_bit(bitnum, addr); #elif defined CONFIG_PREEMPT_COUNT return preempt_count(); #else return 1; #endif } #endif /* __LINUX_BIT_SPINLOCK_H */
161 49 93 93 93 92 93 93 161 160 56 160 160 159 4 4 4 160 158 146 145 149 160 146 111 145 56 56 53 160 159 51 51 49 161 161 45 44 45 47 6 6 6 6 6 5 6 49 49 49 2 49 49 47 47 2 47 47 2 45 45 45 45 2 45 49 43 36 43 43 10 10 10 10 1 10 10 10 2 2 2 2 2 2 2 13 13 12 2 9 13 93 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS dat/inode allocator * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Originally written by Koji Sato. * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/bitops.h> #include <linux/slab.h> #include "mdt.h" #include "alloc.h" /** * nilfs_palloc_groups_per_desc_block - get the number of groups that a group * descriptor block can maintain * @inode: inode of metadata file using this allocator */ static inline unsigned long nilfs_palloc_groups_per_desc_block(const struct inode *inode) { return i_blocksize(inode) / sizeof(struct nilfs_palloc_group_desc); } /** * nilfs_palloc_groups_count - get maximum number of groups * @inode: inode of metadata file using this allocator */ static inline unsigned long nilfs_palloc_groups_count(const struct inode *inode) { return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); } /** * nilfs_palloc_init_blockgroup - initialize private variables for allocator * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object */ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); if (!mi->mi_bgl) return -ENOMEM; bgl_lock_init(mi->mi_bgl); nilfs_mdt_set_entry_size(inode, entry_size, 0); mi->mi_blocks_per_group = DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), mi->mi_entries_per_block) + 1; /* * Number of blocks in a group including entry blocks * and a bitmap block */ mi->mi_blocks_per_desc_block = nilfs_palloc_groups_per_desc_block(inode) * mi->mi_blocks_per_group + 1; /* * Number of blocks per descriptor including the * descriptor block */ return 0; } /** * nilfs_palloc_group - get group number and offset from an entry number * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @offset: pointer to store offset number in the group */ static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, unsigned long *offset) { __u64 group = nr; *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); return group; } /** * nilfs_palloc_desc_blkoff - get block offset of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number * * nilfs_palloc_desc_blkoff() returns block offset of the descriptor * block which contains a descriptor of the specified group. */ static unsigned long nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) { unsigned long desc_block = group / nilfs_palloc_groups_per_desc_block(inode); return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; } /** * nilfs_palloc_bitmap_blkoff - get block offset of a bitmap block * @inode: inode of metadata file using this allocator * @group: group number * * nilfs_palloc_bitmap_blkoff() returns block offset of the bitmap * block used to allocate/deallocate entries in the specified group. */ static unsigned long nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) { unsigned long desc_offset = group % nilfs_palloc_groups_per_desc_block(inode); return nilfs_palloc_desc_blkoff(inode, group) + 1 + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; } /** * nilfs_palloc_group_desc_nfrees - get the number of free entries in a group * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc */ static unsigned long nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, spinlock_t *lock) { unsigned long nfree; spin_lock(lock); nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); return nfree; } /** * nilfs_palloc_group_desc_add_entries - adjust count of free entries * @desc: pointer to descriptor structure for the group * @lock: spin lock protecting @desc * @n: delta to be added */ static u32 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, spinlock_t *lock, u32 n) { u32 nfree; spin_lock(lock); le32_add_cpu(&desc->pg_nfrees, n); nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); return nfree; } /** * nilfs_palloc_entry_blkoff - get block offset of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) */ static unsigned long nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) { unsigned long group, group_offset; group = nilfs_palloc_group(inode, nr, &group_offset); return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + group_offset / NILFS_MDT(inode)->mi_entries_per_block; } /** * nilfs_palloc_desc_block_init - initialize buffer of a group descriptor block * @inode: inode of metadata file * @bh: buffer head of the buffer to be initialized * @from: kernel address mapped for a chunk of the block * * This function does not yet support the case where block size > PAGE_SIZE. */ static void nilfs_palloc_desc_block_init(struct inode *inode, struct buffer_head *bh, void *from) { struct nilfs_palloc_group_desc *desc = from; unsigned long n = nilfs_palloc_groups_per_desc_block(inode); __le32 nfrees; nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); while (n-- > 0) { desc->pg_nfrees = nfrees; desc++; } } static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, int create, void (*init_block)(struct inode *, struct buffer_head *, void *), struct buffer_head **bhp, struct nilfs_bh_assoc *prev, spinlock_t *lock) { int ret; spin_lock(lock); if (prev->bh && blkoff == prev->blkoff && likely(buffer_uptodate(prev->bh))) { get_bh(prev->bh); *bhp = prev->bh; spin_unlock(lock); return 0; } spin_unlock(lock); ret = nilfs_mdt_get_block(inode, blkoff, create, init_block, bhp); if (!ret) { spin_lock(lock); /* * The following code must be safe for change of the * cache contents during the get block call. */ brelse(prev->bh); get_bh(*bhp); prev->bh = *bhp; prev->blkoff = blkoff; spin_unlock(lock); } return ret; } /** * nilfs_palloc_delete_block - delete a block on the persistent allocator file * @inode: inode of metadata file using this allocator * @blkoff: block offset * @prev: nilfs_bh_assoc struct of the last used buffer * @lock: spin lock protecting @prev */ static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, struct nilfs_bh_assoc *prev, spinlock_t *lock) { spin_lock(lock); if (prev->bh && blkoff == prev->blkoff) { brelse(prev->bh); prev->bh = NULL; } spin_unlock(lock); return nilfs_mdt_delete_block(inode, blkoff); } /** * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head */ static int nilfs_palloc_get_desc_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_desc_blkoff(inode, group), create, nilfs_palloc_desc_block_init, bhp, &cache->prev_desc, &cache->lock); } /** * nilfs_palloc_get_bitmap_block - get buffer head of a bitmap block * @inode: inode of metadata file using this allocator * @group: group number * @create: create flag * @bhp: pointer to store the resultant buffer head */ static int nilfs_palloc_get_bitmap_block(struct inode *inode, unsigned long group, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_bitmap_blkoff(inode, group), create, NULL, bhp, &cache->prev_bitmap, &cache->lock); } /** * nilfs_palloc_delete_bitmap_block - delete a bitmap block * @inode: inode of metadata file using this allocator * @group: group number */ static int nilfs_palloc_delete_bitmap_block(struct inode *inode, unsigned long group) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_delete_block(inode, nilfs_palloc_bitmap_blkoff(inode, group), &cache->prev_bitmap, &cache->lock); } /** * nilfs_palloc_get_entry_block - get buffer head of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @create: create flag * @bhp: pointer to store the resultant buffer head */ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, int create, struct buffer_head **bhp) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), create, NULL, bhp, &cache->prev_entry, &cache->lock); } /** * nilfs_palloc_delete_entry_block - delete an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry */ static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; return nilfs_palloc_delete_block(inode, nilfs_palloc_entry_blkoff(inode, nr), &cache->prev_entry, &cache->lock); } /** * nilfs_palloc_group_desc_offset - calculate the byte offset of a group * descriptor in the folio containing it * @inode: inode of metadata file using this allocator * @group: group number * @bh: buffer head of the group descriptor block * * Return: Byte offset in the folio of the group descriptor for @group. */ static size_t nilfs_palloc_group_desc_offset(const struct inode *inode, unsigned long group, const struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data) + sizeof(struct nilfs_palloc_group_desc) * (group % nilfs_palloc_groups_per_desc_block(inode)); } /** * nilfs_palloc_bitmap_offset - calculate the byte offset of a bitmap block * in the folio containing it * @bh: buffer head of the bitmap block * * Return: Byte offset in the folio of the bitmap block for @bh. */ static size_t nilfs_palloc_bitmap_offset(const struct buffer_head *bh) { return offset_in_folio(bh->b_folio, bh->b_data); } /** * nilfs_palloc_entry_offset - calculate the byte offset of an entry in the * folio containing it * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) * @bh: buffer head of the entry block * * Return: Byte offset in the folio of the entry @nr. */ size_t nilfs_palloc_entry_offset(const struct inode *inode, __u64 nr, const struct buffer_head *bh) { unsigned long entry_index_in_group, entry_index_in_block; nilfs_palloc_group(inode, nr, &entry_index_in_group); entry_index_in_block = entry_index_in_group % NILFS_MDT(inode)->mi_entries_per_block; return offset_in_folio(bh->b_folio, bh->b_data) + entry_index_in_block * NILFS_MDT(inode)->mi_entry_size; } /** * nilfs_palloc_find_available_slot - find available slot in a group * @bitmap: bitmap of the group * @target: offset number of an entry in the group (start point) * @bsize: size in bits * @lock: spin lock protecting @bitmap * @wrap: whether to wrap around */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, unsigned int bsize, spinlock_t *lock, bool wrap) { int pos, end = bsize; if (likely(target < bsize)) { pos = target; do { pos = nilfs_find_next_zero_bit(bitmap, end, pos); if (pos >= end) break; if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; } while (++pos < end); end = target; } if (!wrap) return -ENOSPC; /* wrap around */ for (pos = 0; pos < end; pos++) { pos = nilfs_find_next_zero_bit(bitmap, end, pos); if (pos >= end) break; if (!nilfs_set_bit_atomic(lock, pos, bitmap)) return pos; } return -ENOSPC; } /** * nilfs_palloc_rest_groups_in_desc_block - get the remaining number of groups * in a group descriptor block * @inode: inode of metadata file using this allocator * @curr: current group number * @max: maximum number of groups */ static unsigned long nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, unsigned long curr, unsigned long max) { return min_t(unsigned long, nilfs_palloc_groups_per_desc_block(inode) - curr % nilfs_palloc_groups_per_desc_block(inode), max - curr + 1); } /** * nilfs_palloc_count_desc_blocks - count descriptor blocks number * @inode: inode of metadata file using this allocator * @desc_blocks: descriptor blocks number [out] */ static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) { __u64 blknum; int ret; ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); if (likely(!ret)) *desc_blocks = DIV_ROUND_UP( (unsigned long)blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block); return ret; } /** * nilfs_palloc_mdt_file_can_grow - check potential opportunity for * MDT file growing * @inode: inode of metadata file using this allocator * @desc_blocks: known current descriptor blocks count */ static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, unsigned long desc_blocks) { return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) < nilfs_palloc_groups_count(inode); } /** * nilfs_palloc_count_max_entries - count max number of entries that can be * described by descriptor blocks count * @inode: inode of metadata file using this allocator * @nused: current number of used entries * @nmaxp: max number of entries [out] */ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) { unsigned long desc_blocks = 0; u64 entries_per_desc_block, nmax; int err; err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks); if (unlikely(err)) return err; entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) * nilfs_palloc_groups_per_desc_block(inode); nmax = entries_per_desc_block * desc_blocks; if (nused == nmax && nilfs_palloc_mdt_file_can_grow(inode, desc_blocks)) nmax += entries_per_desc_block; if (nused > nmax) return -ERANGE; *nmaxp = nmax; return 0; } /** * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation * @wrap: whether to wrap around */ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req, bool wrap) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; size_t doff, boff; unsigned long group, maxgroup, ngroups; unsigned long group_offset, maxgroup_offset; unsigned long n, entries_per_group; unsigned long i, j; spinlock_t *lock; int pos, ret; ngroups = nilfs_palloc_groups_count(inode); maxgroup = ngroups - 1; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); entries_per_group = nilfs_palloc_entries_per_group(inode); for (i = 0; i < ngroups; i += n) { if (group >= ngroups && wrap) { /* wrap around */ group = 0; maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, &maxgroup_offset) - 1; } ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh); desc = kmap_local_folio(desc_bh->b_folio, doff); n = nilfs_palloc_rest_groups_in_desc_block(inode, group, maxgroup); for (j = 0; j < n; j++, group++, group_offset = 0) { lock = nilfs_mdt_bgl_lock(inode, group); if (nilfs_palloc_group_desc_nfrees(&desc[j], lock) == 0) continue; kunmap_local(desc); ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (unlikely(ret < 0)) { brelse(desc_bh); return ret; } /* * Re-kmap the folio containing the first (and * subsequent) group descriptors. */ desc = kmap_local_folio(desc_bh->b_folio, doff); boff = nilfs_palloc_bitmap_offset(bitmap_bh); bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); pos = nilfs_palloc_find_available_slot( bitmap, group_offset, entries_per_group, lock, wrap); /* * Since the search for a free slot in the second and * subsequent bitmap blocks always starts from the * beginning, the wrap flag only has an effect on the * first search. */ kunmap_local(bitmap); if (pos >= 0) goto found; brelse(bitmap_bh); } kunmap_local(desc); brelse(desc_bh); } /* no entries left */ return -ENOSPC; found: /* found a free entry */ nilfs_palloc_group_desc_add_entries(&desc[j], lock, -1); req->pr_entry_nr = entries_per_group * group + pos; kunmap_local(desc); req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; return 0; } /** * nilfs_palloc_commit_alloc_entry - finish allocation of a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation */ void nilfs_palloc_commit_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { mark_buffer_dirty(req->pr_bitmap_bh); mark_buffer_dirty(req->pr_desc_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); } /** * nilfs_palloc_commit_free_entry - finish deallocating a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ void nilfs_palloc_commit_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { unsigned long group, group_offset; size_t doff, boff; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, "%s (ino=%lu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap_local(bitmap); kunmap_local(desc); mark_buffer_dirty(req->pr_desc_bh); mark_buffer_dirty(req->pr_bitmap_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); } /** * nilfs_palloc_abort_alloc_entry - cancel allocation of a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the allocation */ void nilfs_palloc_abort_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct nilfs_palloc_group_desc *desc; size_t doff, boff; unsigned char *bitmap; unsigned long group, group_offset; spinlock_t *lock; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); doff = nilfs_palloc_group_desc_offset(inode, group, req->pr_desc_bh); desc = kmap_local_folio(req->pr_desc_bh->b_folio, doff); boff = nilfs_palloc_bitmap_offset(req->pr_bitmap_bh); bitmap = kmap_local_folio(req->pr_bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, "%s (ino=%lu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); kunmap_local(bitmap); kunmap_local(desc); brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); req->pr_entry_nr = 0; req->pr_bitmap_bh = NULL; req->pr_desc_bh = NULL; } /** * nilfs_palloc_prepare_free_entry - prepare to deallocate a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ int nilfs_palloc_prepare_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { struct buffer_head *desc_bh, *bitmap_bh; unsigned long group, group_offset; int ret; group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); if (ret < 0) return ret; ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); if (ret < 0) { brelse(desc_bh); return ret; } req->pr_desc_bh = desc_bh; req->pr_bitmap_bh = bitmap_bh; return 0; } /** * nilfs_palloc_abort_free_entry - cancel deallocating a persistent object * @inode: inode of metadata file using this allocator * @req: nilfs_palloc_req structure exchanged for the removal */ void nilfs_palloc_abort_free_entry(struct inode *inode, struct nilfs_palloc_req *req) { brelse(req->pr_bitmap_bh); brelse(req->pr_desc_bh); req->pr_entry_nr = 0; req->pr_bitmap_bh = NULL; req->pr_desc_bh = NULL; } /** * nilfs_palloc_freev - deallocate a set of persistent objects * @inode: inode of metadata file using this allocator * @entry_nrs: array of entry numbers to be deallocated * @nitems: number of entries stored in @entry_nrs */ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) { struct buffer_head *desc_bh, *bitmap_bh; struct nilfs_palloc_group_desc *desc; unsigned char *bitmap; size_t doff, boff; unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block; unsigned int entry_start, end, pos; spinlock_t *lock; int i, j, k, ret; u32 nfree; for (i = 0; i < nitems; i = j) { int change_group = false; int nempties = 0, n = 0; group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); if (ret < 0) return ret; ret = nilfs_palloc_get_bitmap_block(inode, group, 0, &bitmap_bh); if (ret < 0) { brelse(desc_bh); return ret; } /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; boff = nilfs_palloc_bitmap_offset(bitmap_bh); bitmap = kmap_local_folio(bitmap_bh->b_folio, boff); lock = nilfs_mdt_bgl_lock(inode, group); j = i; entry_start = rounddown(group_offset, epb); do { if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warn(inode->i_sb, "%s (ino=%lu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)entry_nrs[j]); } else { n++; } j++; if (j >= nitems || entry_nrs[j] < group_min_nr || entry_nrs[j] >= group_min_nr + epg) { change_group = true; } else { group_offset = entry_nrs[j] - group_min_nr; if (group_offset >= entry_start && group_offset < entry_start + epb) { /* This entry is in the same block */ continue; } } /* Test if the entry block is empty or not */ end = entry_start + epb; pos = nilfs_find_next_bit(bitmap, end, entry_start); if (pos >= end) { last_nrs[nempties++] = entry_nrs[j - 1]; if (nempties >= ARRAY_SIZE(last_nrs)) break; } if (change_group) break; /* Go on to the next entry block */ entry_start = rounddown(group_offset, epb); } while (true); kunmap_local(bitmap); mark_buffer_dirty(bitmap_bh); brelse(bitmap_bh); for (k = 0; k < nempties; k++) { ret = nilfs_palloc_delete_entry_block(inode, last_nrs[k]); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, "error %d deleting block that object (entry=%llu, ino=%lu) belongs to", ret, (unsigned long long)last_nrs[k], inode->i_ino); } doff = nilfs_palloc_group_desc_offset(inode, group, desc_bh); desc = kmap_local_folio(desc_bh->b_folio, doff); nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n); kunmap_local(desc); mark_buffer_dirty(desc_bh); nilfs_mdt_mark_dirty(inode); brelse(desc_bh); if (nfree == nilfs_palloc_entries_per_group(inode)) { ret = nilfs_palloc_delete_bitmap_block(inode, group); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, "error %d deleting bitmap block of group=%lu, ino=%lu", ret, group, inode->i_ino); } } return 0; } void nilfs_palloc_setup_cache(struct inode *inode, struct nilfs_palloc_cache *cache) { NILFS_MDT(inode)->mi_palloc_cache = cache; spin_lock_init(&cache->lock); } void nilfs_palloc_clear_cache(struct inode *inode) { struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; spin_lock(&cache->lock); brelse(cache->prev_desc.bh); brelse(cache->prev_bitmap.bh); brelse(cache->prev_entry.bh); cache->prev_desc.bh = NULL; cache->prev_bitmap.bh = NULL; cache->prev_entry.bh = NULL; spin_unlock(&cache->lock); } void nilfs_palloc_destroy_cache(struct inode *inode) { nilfs_palloc_clear_cache(inode); NILFS_MDT(inode)->mi_palloc_cache = NULL; }
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 // SPDX-License-Identifier: GPL-2.0 // // Register map access API - debugfs // // Copyright 2011 Wolfson Microelectronics plc // // Author: Mark Brown <broonie@opensource.wolfsonmicro.com> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/device.h> #include <linux/list.h> #include "internal.h" struct regmap_debugfs_node { struct regmap *map; struct list_head link; }; static unsigned int dummy_index; static struct dentry *regmap_debugfs_root; static LIST_HEAD(regmap_debugfs_early_list); static DEFINE_MUTEX(regmap_debugfs_early_lock); /* Calculate the length of a fixed format */ static size_t regmap_calc_reg_len(int max_val) { return snprintf(NULL, 0, "%x", max_val); } static ssize_t regmap_name_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct regmap *map = file->private_data; const char *name = "nodev"; int ret; char *buf; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; if (map->dev && map->dev->driver) name = map->dev->driver->name; ret = snprintf(buf, PAGE_SIZE, "%s\n", name); if (ret >= PAGE_SIZE) { kfree(buf); return ret; } ret = simple_read_from_buffer(user_buf, count, ppos, buf, ret); kfree(buf); return ret; } static const struct file_operations regmap_name_fops = { .open = simple_open, .read = regmap_name_read_file, .llseek = default_llseek, }; static void regmap_debugfs_free_dump_cache(struct regmap *map) { struct regmap_debugfs_off_cache *c; while (!list_empty(&map->debugfs_off_cache)) { c = list_first_entry(&map->debugfs_off_cache, struct regmap_debugfs_off_cache, list); list_del(&c->list); kfree(c); } } static bool regmap_printable(struct regmap *map, unsigned int reg) { if (regmap_precious(map, reg)) return false; if (!regmap_readable(map, reg) && !regmap_cached(map, reg)) return false; return true; } /* * Work out where the start offset maps into register numbers, bearing * in mind that we suppress hidden registers. */ static unsigned int regmap_debugfs_get_dump_start(struct regmap *map, unsigned int base, loff_t from, loff_t *pos) { struct regmap_debugfs_off_cache *c = NULL; loff_t p = 0; unsigned int i, ret; unsigned int fpos_offset; unsigned int reg_offset; /* Suppress the cache if we're using a subrange */ if (base) return base; /* * If we don't have a cache build one so we don't have to do a * linear scan each time. */ mutex_lock(&map->cache_lock); i = base; if (list_empty(&map->debugfs_off_cache)) { for (; i <= map->max_register; i += map->reg_stride) { /* Skip unprinted registers, closing off cache entry */ if (!regmap_printable(map, i)) { if (c) { c->max = p - 1; c->max_reg = i - map->reg_stride; list_add_tail(&c->list, &map->debugfs_off_cache); c = NULL; } continue; } /* No cache entry? Start a new one */ if (!c) { c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) { regmap_debugfs_free_dump_cache(map); mutex_unlock(&map->cache_lock); return base; } c->min = p; c->base_reg = i; } p += map->debugfs_tot_len; } } /* Close the last entry off if we didn't scan beyond it */ if (c) { c->max = p - 1; c->max_reg = i - map->reg_stride; list_add_tail(&c->list, &map->debugfs_off_cache); } /* * This should never happen; we return above if we fail to * allocate and we should never be in this code if there are * no registers at all. */ WARN_ON(list_empty(&map->debugfs_off_cache)); ret = base; /* Find the relevant block:offset */ list_for_each_entry(c, &map->debugfs_off_cache, list) { if (from >= c->min && from <= c->max) { fpos_offset = from - c->min; reg_offset = fpos_offset / map->debugfs_tot_len; *pos = c->min + (reg_offset * map->debugfs_tot_len); mutex_unlock(&map->cache_lock); return c->base_reg + (reg_offset * map->reg_stride); } *pos = c->max; ret = c->max_reg; } mutex_unlock(&map->cache_lock); return ret; } static inline void regmap_calc_tot_len(struct regmap *map, void *buf, size_t count) { /* Calculate the length of a fixed format */ if (!map->debugfs_tot_len) { map->debugfs_reg_len = regmap_calc_reg_len(map->max_register); map->debugfs_val_len = 2 * map->format.val_bytes; map->debugfs_tot_len = map->debugfs_reg_len + map->debugfs_val_len + 3; /* : \n */ } } static int regmap_next_readable_reg(struct regmap *map, int reg) { struct regmap_debugfs_off_cache *c; int ret = -EINVAL; if (regmap_printable(map, reg + map->reg_stride)) { ret = reg + map->reg_stride; } else { mutex_lock(&map->cache_lock); list_for_each_entry(c, &map->debugfs_off_cache, list) { if (reg > c->max_reg) continue; if (reg < c->base_reg) { ret = c->base_reg; break; } } mutex_unlock(&map->cache_lock); } return ret; } static ssize_t regmap_read_debugfs(struct regmap *map, unsigned int from, unsigned int to, char __user *user_buf, size_t count, loff_t *ppos) { size_t buf_pos = 0; loff_t p = *ppos; ssize_t ret; int i; char *buf; unsigned int val, start_reg; if (*ppos < 0 || !count) return -EINVAL; if (count > (PAGE_SIZE << MAX_PAGE_ORDER)) count = PAGE_SIZE << MAX_PAGE_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) return -ENOMEM; regmap_calc_tot_len(map, buf, count); /* Work out which register we're starting at */ start_reg = regmap_debugfs_get_dump_start(map, from, *ppos, &p); for (i = start_reg; i >= 0 && i <= to; i = regmap_next_readable_reg(map, i)) { /* If we're in the region the user is trying to read */ if (p >= *ppos) { /* ...but not beyond it */ if (buf_pos + map->debugfs_tot_len > count) break; /* Format the register */ snprintf(buf + buf_pos, count - buf_pos, "%.*x: ", map->debugfs_reg_len, i - from); buf_pos += map->debugfs_reg_len + 2; /* Format the value, write all X if we can't read */ ret = regmap_read(map, i, &val); if (ret == 0) snprintf(buf + buf_pos, count - buf_pos, "%.*x", map->debugfs_val_len, val); else memset(buf + buf_pos, 'X', map->debugfs_val_len); buf_pos += 2 * map->format.val_bytes; buf[buf_pos++] = '\n'; } p += map->debugfs_tot_len; } ret = buf_pos; if (copy_to_user(user_buf, buf, buf_pos)) { ret = -EFAULT; goto out; } *ppos += buf_pos; out: kfree(buf); return ret; } static ssize_t regmap_map_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct regmap *map = file->private_data; return regmap_read_debugfs(map, 0, map->max_register, user_buf, count, ppos); } #undef REGMAP_ALLOW_WRITE_DEBUGFS #ifdef REGMAP_ALLOW_WRITE_DEBUGFS /* * This can be dangerous especially when we have clients such as * PMICs, therefore don't provide any real compile time configuration option * for this feature, people who want to use this will need to modify * the source code directly. */ static ssize_t regmap_map_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; size_t buf_size; char *start = buf; unsigned long reg, value; struct regmap *map = file->private_data; int ret; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; buf[buf_size] = 0; while (*start == ' ') start++; reg = simple_strtoul(start, &start, 16); while (*start == ' ') start++; if (kstrtoul(start, 16, &value)) return -EINVAL; /* Userspace has been fiddling around behind the kernel's back */ add_taint(TAINT_USER, LOCKDEP_STILL_OK); ret = regmap_write(map, reg, value); if (ret < 0) return ret; return buf_size; } #else #define regmap_map_write_file NULL #endif static const struct file_operations regmap_map_fops = { .open = simple_open, .read = regmap_map_read_file, .write = regmap_map_write_file, .llseek = default_llseek, }; static ssize_t regmap_range_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct regmap_range_node *range = file->private_data; struct regmap *map = range->map; return regmap_read_debugfs(map, range->range_min, range->range_max, user_buf, count, ppos); } static const struct file_operations regmap_range_fops = { .open = simple_open, .read = regmap_range_read_file, .llseek = default_llseek, }; static ssize_t regmap_reg_ranges_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct regmap *map = file->private_data; struct regmap_debugfs_off_cache *c; loff_t p = 0; size_t buf_pos = 0; char *buf; char *entry; int ret; unsigned int entry_len; if (*ppos < 0 || !count) return -EINVAL; if (count > (PAGE_SIZE << MAX_PAGE_ORDER)) count = PAGE_SIZE << MAX_PAGE_ORDER; buf = kmalloc(count, GFP_KERNEL); if (!buf) return -ENOMEM; entry = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!entry) { kfree(buf); return -ENOMEM; } /* While we are at it, build the register dump cache * now so the read() operation on the `registers' file * can benefit from using the cache. We do not care * about the file position information that is contained * in the cache, just about the actual register blocks */ regmap_calc_tot_len(map, buf, count); regmap_debugfs_get_dump_start(map, 0, *ppos, &p); /* Reset file pointer as the fixed-format of the `registers' * file is not compatible with the `range' file */ p = 0; mutex_lock(&map->cache_lock); list_for_each_entry(c, &map->debugfs_off_cache, list) { entry_len = snprintf(entry, PAGE_SIZE, "%x-%x\n", c->base_reg, c->max_reg); if (p >= *ppos) { if (buf_pos + entry_len > count) break; memcpy(buf + buf_pos, entry, entry_len); buf_pos += entry_len; } p += entry_len; } mutex_unlock(&map->cache_lock); kfree(entry); ret = buf_pos; if (copy_to_user(user_buf, buf, buf_pos)) { ret = -EFAULT; goto out_buf; } *ppos += buf_pos; out_buf: kfree(buf); return ret; } static const struct file_operations regmap_reg_ranges_fops = { .open = simple_open, .read = regmap_reg_ranges_read_file, .llseek = default_llseek, }; static int regmap_access_show(struct seq_file *s, void *ignored) { struct regmap *map = s->private; int i, reg_len; reg_len = regmap_calc_reg_len(map->max_register); for (i = 0; i <= map->max_register; i += map->reg_stride) { /* Ignore registers which are neither readable nor writable */ if (!regmap_readable(map, i) && !regmap_writeable(map, i)) continue; /* Format the register */ seq_printf(s, "%.*x: %c %c %c %c\n", reg_len, i, regmap_readable(map, i) ? 'y' : 'n', regmap_writeable(map, i) ? 'y' : 'n', regmap_volatile(map, i) ? 'y' : 'n', regmap_precious(map, i) ? 'y' : 'n'); } return 0; } DEFINE_SHOW_ATTRIBUTE(regmap_access); static ssize_t regmap_cache_only_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct regmap *map = container_of(file->private_data, struct regmap, cache_only); bool new_val, require_sync = false; int err; err = kstrtobool_from_user(user_buf, count, &new_val); /* Ignore malforned data like debugfs_write_file_bool() */ if (err) return count; err = debugfs_file_get(file->f_path.dentry); if (err) return err; map->lock(map->lock_arg); if (new_val && !map->cache_only) { dev_warn(map->dev, "debugfs cache_only=Y forced\n"); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } else if (!new_val && map->cache_only) { dev_warn(map->dev, "debugfs cache_only=N forced: syncing cache\n"); require_sync = true; } map->cache_only = new_val; map->unlock(map->lock_arg); debugfs_file_put(file->f_path.dentry); if (require_sync) { err = regcache_sync(map); if (err) dev_err(map->dev, "Failed to sync cache %d\n", err); } return count; } static const struct file_operations regmap_cache_only_fops = { .open = simple_open, .read = debugfs_read_file_bool, .write = regmap_cache_only_write_file, }; static ssize_t regmap_cache_bypass_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct regmap *map = container_of(file->private_data, struct regmap, cache_bypass); bool new_val; int err; err = kstrtobool_from_user(user_buf, count, &new_val); /* Ignore malforned data like debugfs_write_file_bool() */ if (err) return count; err = debugfs_file_get(file->f_path.dentry); if (err) return err; map->lock(map->lock_arg); if (new_val && !map->cache_bypass) { dev_warn(map->dev, "debugfs cache_bypass=Y forced\n"); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } else if (!new_val && map->cache_bypass) { dev_warn(map->dev, "debugfs cache_bypass=N forced\n"); } map->cache_bypass = new_val; map->unlock(map->lock_arg); debugfs_file_put(file->f_path.dentry); return count; } static const struct file_operations regmap_cache_bypass_fops = { .open = simple_open, .read = debugfs_read_file_bool, .write = regmap_cache_bypass_write_file, }; void regmap_debugfs_init(struct regmap *map) { struct rb_node *next; struct regmap_range_node *range_node; const char *devname = "dummy"; const char *name = map->name; /* * Userspace can initiate reads from the hardware over debugfs. * Normally internal regmap structures and buffers are protected with * a mutex or a spinlock, but if the regmap owner decided to disable * all locking mechanisms, this is no longer the case. For safety: * don't create the debugfs entries if locking is disabled. */ if (map->debugfs_disable) { dev_dbg(map->dev, "regmap locking disabled - not creating debugfs entries\n"); return; } /* If we don't have the debugfs root yet, postpone init */ if (!regmap_debugfs_root) { struct regmap_debugfs_node *node; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return; node->map = map; mutex_lock(&regmap_debugfs_early_lock); list_add(&node->link, &regmap_debugfs_early_list); mutex_unlock(&regmap_debugfs_early_lock); return; } INIT_LIST_HEAD(&map->debugfs_off_cache); mutex_init(&map->cache_lock); if (map->dev) devname = dev_name(map->dev); if (name) { if (!map->debugfs_name) { map->debugfs_name = kasprintf(GFP_KERNEL, "%s-%s", devname, name); if (!map->debugfs_name) return; } name = map->debugfs_name; } else { name = devname; } if (!strcmp(name, "dummy")) { kfree(map->debugfs_name); map->debugfs_name = kasprintf(GFP_KERNEL, "dummy%d", dummy_index); if (!map->debugfs_name) return; name = map->debugfs_name; dummy_index++; } map->debugfs = debugfs_create_dir(name, regmap_debugfs_root); debugfs_create_file("name", 0400, map->debugfs, map, &regmap_name_fops); debugfs_create_file("range", 0400, map->debugfs, map, &regmap_reg_ranges_fops); if (map->max_register || regmap_readable(map, 0)) { umode_t registers_mode; #if defined(REGMAP_ALLOW_WRITE_DEBUGFS) registers_mode = 0600; #else registers_mode = 0400; #endif debugfs_create_file("registers", registers_mode, map->debugfs, map, &regmap_map_fops); debugfs_create_file("access", 0400, map->debugfs, map, &regmap_access_fops); } if (map->cache_type) { debugfs_create_file("cache_only", 0600, map->debugfs, &map->cache_only, &regmap_cache_only_fops); debugfs_create_bool("cache_dirty", 0400, map->debugfs, &map->cache_dirty); debugfs_create_file("cache_bypass", 0600, map->debugfs, &map->cache_bypass, &regmap_cache_bypass_fops); } /* * This could interfere with driver operation. Therefore, don't provide * any real compile time configuration option for this feature. One will * have to modify the source code directly in order to use it. */ #undef REGMAP_ALLOW_FORCE_WRITE_FIELD_DEBUGFS #ifdef REGMAP_ALLOW_FORCE_WRITE_FIELD_DEBUGFS debugfs_create_bool("force_write_field", 0600, map->debugfs, &map->force_write_field); #endif next = rb_first(&map->range_tree); while (next) { range_node = rb_entry(next, struct regmap_range_node, node); if (range_node->name) debugfs_create_file(range_node->name, 0400, map->debugfs, range_node, &regmap_range_fops); next = rb_next(&range_node->node); } if (map->cache_ops && map->cache_ops->debugfs_init) map->cache_ops->debugfs_init(map); } void regmap_debugfs_exit(struct regmap *map) { if (map->debugfs) { debugfs_remove_recursive(map->debugfs); mutex_lock(&map->cache_lock); regmap_debugfs_free_dump_cache(map); mutex_unlock(&map->cache_lock); kfree(map->debugfs_name); map->debugfs_name = NULL; } else { struct regmap_debugfs_node *node, *tmp; mutex_lock(&regmap_debugfs_early_lock); list_for_each_entry_safe(node, tmp, &regmap_debugfs_early_list, link) { if (node->map == map) { list_del(&node->link); kfree(node); } } mutex_unlock(&regmap_debugfs_early_lock); } } void regmap_debugfs_initcall(void) { struct regmap_debugfs_node *node, *tmp; regmap_debugfs_root = debugfs_create_dir("regmap", NULL); mutex_lock(&regmap_debugfs_early_lock); list_for_each_entry_safe(node, tmp, &regmap_debugfs_early_list, link) { regmap_debugfs_init(node->map); list_del(&node->link); kfree(node); } mutex_unlock(&regmap_debugfs_early_lock); }
1 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 // SPDX-License-Identifier: GPL-2.0 /* * AIRcable USB Bluetooth Dongle Driver. * * Copyright (C) 2010 Johan Hovold <jhovold@gmail.com> * Copyright (C) 2006 Manuel Francisco Naranjo (naranjo.manuel@gmail.com) * * The device works as an standard CDC device, it has 2 interfaces, the first * one is for firmware access and the second is the serial one. * The protocol is very simply, there are two possibilities reading or writing. * When writing the first urb must have a Header that starts with 0x20 0x29 the * next two bytes must say how much data will be sent. * When reading the process is almost equal except that the header starts with * 0x00 0x20. * * The device simply need some stuff to understand data coming from the usb * buffer: The First and Second byte is used for a Header, the Third and Fourth * tells the device the amount of information the package holds. * Packages are 60 bytes long Header Stuff. * When writing to the device the first two bytes of the header are 0x20 0x29 * When reading the bytes are 0x00 0x20, or 0x00 0x10, there is an strange * situation, when too much data arrives to the device because it sends the data * but with out the header. I will use a simply hack to override this situation, * if there is data coming that does not contain any header, then that is data * that must go directly to the tty, as there is no documentation about if there * is any other control code, I will simply check for the first * one. * * I have taken some info from a Greg Kroah-Hartman article: * http://www.linuxjournal.com/article/6573 * And from Linux Device Driver Kit CD, which is a great work, the authors taken * the work to recompile lots of information an knowledge in drivers development * and made it all available inside a cd. * URL: http://kernel.org/pub/linux/kernel/people/gregkh/ddk/ * */ #include <linux/unaligned.h> #include <linux/tty.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/tty_flip.h> #include <linux/usb.h> #include <linux/usb/serial.h> /* Vendor and Product ID */ #define AIRCABLE_VID 0x16CA #define AIRCABLE_USB_PID 0x1502 /* Protocol Stuff */ #define HCI_HEADER_LENGTH 0x4 #define TX_HEADER_0 0x20 #define TX_HEADER_1 0x29 #define RX_HEADER_0 0x00 #define RX_HEADER_1 0x20 #define HCI_COMPLETE_FRAME 64 /* rx_flags */ #define THROTTLED 0x01 #define ACTUALLY_THROTTLED 0x02 #define DRIVER_AUTHOR "Naranjo, Manuel Francisco <naranjo.manuel@gmail.com>, Johan Hovold <jhovold@gmail.com>" #define DRIVER_DESC "AIRcable USB Driver" /* ID table that will be registered with USB core */ static const struct usb_device_id id_table[] = { { USB_DEVICE(AIRCABLE_VID, AIRCABLE_USB_PID) }, { }, }; MODULE_DEVICE_TABLE(usb, id_table); static int aircable_prepare_write_buffer(struct usb_serial_port *port, void *dest, size_t size) { int count; unsigned char *buf = dest; count = kfifo_out_locked(&port->write_fifo, buf + HCI_HEADER_LENGTH, size - HCI_HEADER_LENGTH, &port->lock); buf[0] = TX_HEADER_0; buf[1] = TX_HEADER_1; put_unaligned_le16(count, &buf[2]); return count + HCI_HEADER_LENGTH; } static int aircable_calc_num_ports(struct usb_serial *serial, struct usb_serial_endpoints *epds) { /* Ignore the first interface, which has no bulk endpoints. */ if (epds->num_bulk_out == 0) { dev_dbg(&serial->interface->dev, "ignoring interface with no bulk-out endpoints\n"); return -ENODEV; } return 1; } static int aircable_process_packet(struct usb_serial_port *port, int has_headers, char *packet, int len) { if (has_headers) { len -= HCI_HEADER_LENGTH; packet += HCI_HEADER_LENGTH; } if (len <= 0) { dev_dbg(&port->dev, "%s - malformed packet\n", __func__); return 0; } tty_insert_flip_string(&port->port, packet, len); return len; } static void aircable_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; char *data = urb->transfer_buffer; int has_headers; int count; int len; int i; has_headers = (urb->actual_length > 2 && data[0] == RX_HEADER_0); count = 0; for (i = 0; i < urb->actual_length; i += HCI_COMPLETE_FRAME) { len = min_t(int, urb->actual_length - i, HCI_COMPLETE_FRAME); count += aircable_process_packet(port, has_headers, &data[i], len); } if (count) tty_flip_buffer_push(&port->port); } static struct usb_serial_driver aircable_device = { .driver = { .name = "aircable", }, .id_table = id_table, .bulk_out_size = HCI_COMPLETE_FRAME, .calc_num_ports = aircable_calc_num_ports, .process_read_urb = aircable_process_read_urb, .prepare_write_buffer = aircable_prepare_write_buffer, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, }; static struct usb_serial_driver * const serial_drivers[] = { &aircable_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2");
4 4 2 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/netfilter/xt_devgroup.h> #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: Device group match"); MODULE_ALIAS("ipt_devgroup"); MODULE_ALIAS("ip6t_devgroup"); static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_devgroup_info *info = par->matchinfo; if (info->flags & XT_DEVGROUP_MATCH_SRC && (((info->src_group ^ xt_in(par)->group) & info->src_mask ? 1 : 0) ^ ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0))) return false; if (info->flags & XT_DEVGROUP_MATCH_DST && (((info->dst_group ^ xt_out(par)->group) & info->dst_mask ? 1 : 0) ^ ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0))) return false; return true; } static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) { const struct xt_devgroup_info *info = par->matchinfo; if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) return -EINVAL; if (info->flags & XT_DEVGROUP_MATCH_SRC && par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD))) return -EINVAL; if (info->flags & XT_DEVGROUP_MATCH_DST && par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) return -EINVAL; return 0; } static struct xt_match devgroup_mt_reg __read_mostly = { .name = "devgroup", .match = devgroup_mt, .checkentry = devgroup_mt_checkentry, .matchsize = sizeof(struct xt_devgroup_info), .family = NFPROTO_UNSPEC, .me = THIS_MODULE }; static int __init devgroup_mt_init(void) { return xt_register_match(&devgroup_mt_reg); } static void __exit devgroup_mt_exit(void) { xt_unregister_match(&devgroup_mt_reg); } module_init(devgroup_mt_init); module_exit(devgroup_mt_exit);
236 8 144 5 249 12 77 167 161 231 82 2 167 176 16 176 7 163 163 43 135 55 77 137 1 134 68 152 163 38 6 8 171 166 3 32 8 1 14 89 2 34 34 1 6 117 9 2 139 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 /* SPDX-License-Identifier: GPL-2.0-only */ /* * * Copyright (C) 2011 Novell Inc. */ #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/fs.h> #include <linux/fsverity.h> #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include "ovl_entry.h" #undef pr_fmt #define pr_fmt(fmt) "overlayfs: " fmt enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), __OVL_PATH_ORIGIN = (1 << 2), }; #define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) #define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) #define OVL_XATTR_NAMESPACE "overlay." #define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE #define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) #define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE #define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1) #define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE #define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1) #define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX #define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1) #define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX #define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1) enum ovl_xattr { OVL_XATTR_OPAQUE, OVL_XATTR_REDIRECT, OVL_XATTR_ORIGIN, OVL_XATTR_IMPURE, OVL_XATTR_NLINK, OVL_XATTR_UPPER, OVL_XATTR_UUID, OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, OVL_XATTR_XWHITEOUT, }; enum ovl_inode_flag { /* Pure upper dir that may contain non pure upper entries */ OVL_IMPURE, /* Non-merge dir that may contain whiteout entries */ OVL_WHITEOUTS, OVL_INDEX, OVL_UPPERDATA, /* Inode number will remain constant over copy up. */ OVL_CONST_INO, OVL_HAS_DIGEST, OVL_VERIFIED_DIGEST, }; enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, OVL_E_CONNECTED, /* Lower stack may contain xwhiteout entries */ OVL_E_XWHITEOUTS, }; enum { OVL_REDIRECT_OFF, /* "off" mode is never used. In effect */ OVL_REDIRECT_FOLLOW, /* ...it translates to either "follow" */ OVL_REDIRECT_NOFOLLOW, /* ...or "nofollow". */ OVL_REDIRECT_ON, }; enum { OVL_UUID_OFF, OVL_UUID_NULL, OVL_UUID_AUTO, OVL_UUID_ON, }; enum { OVL_XINO_OFF, OVL_XINO_AUTO, OVL_XINO_ON, }; enum { OVL_VERITY_OFF, OVL_VERITY_ON, OVL_VERITY_REQUIRE, }; /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: * origin.fh - exported file handle of the lower file * origin.uuid - uuid of the lower filesystem */ #define OVL_FH_VERSION 0 #define OVL_FH_MAGIC 0xfb /* CPU byte order required for fid decoding: */ #define OVL_FH_FLAG_BIG_ENDIAN (1 << 0) #define OVL_FH_FLAG_ANY_ENDIAN (1 << 1) /* Is the real inode encoded in fid an upper inode? */ #define OVL_FH_FLAG_PATH_UPPER (1 << 2) #define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \ OVL_FH_FLAG_PATH_UPPER) #if defined(__LITTLE_ENDIAN) #define OVL_FH_FLAG_CPU_ENDIAN 0 #elif defined(__BIG_ENDIAN) #define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN #else #error Endianness not defined #endif /* The type used to be returned by overlay exportfs for misaligned fid */ #define OVL_FILEID_V0 0xfb /* The type returned by overlay exportfs for 32bit aligned fid */ #define OVL_FILEID_V1 0xf8 /* On-disk format for "origin" file handle */ struct ovl_fb { u8 version; /* 0 */ u8 magic; /* 0xfb */ u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ u32 fid[]; /* file identifier should be 32bit aligned in-memory */ } __packed; /* In-memory and on-wire format for overlay file handle */ struct ovl_fh { u8 padding[3]; /* make sure fb.fid is 32bit aligned */ union { struct ovl_fb fb; DECLARE_FLEX_ARRAY(u8, buf); }; } __packed; #define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) #define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) #define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ offsetof(struct ovl_fb, fid)) /* On-disk format for "metacopy" xattr (if non-zero size) */ struct ovl_metacopy { u8 version; /* 0 */ u8 len; /* size of this header + used digest bytes */ u8 flags; u8 digest_algo; /* FS_VERITY_HASH_ALG_* constant, 0 for no digest */ u8 digest[FS_VERITY_MAX_DIGEST_SIZE]; /* Only the used part on disk */ } __packed; #define OVL_METACOPY_MAX_SIZE (sizeof(struct ovl_metacopy)) #define OVL_METACOPY_MIN_SIZE (OVL_METACOPY_MAX_SIZE - FS_VERITY_MAX_DIGEST_SIZE) #define OVL_METACOPY_INIT { 0, OVL_METACOPY_MIN_SIZE } static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy) { if (metacopy->len < OVL_METACOPY_MIN_SIZE) return 0; return (int)metacopy->len - OVL_METACOPY_MIN_SIZE; } /* No atime modification on underlying */ #define OVL_OPEN_FLAGS (O_NOATIME) extern const char *const ovl_xattr_table[][2]; static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) { return ovl_xattr_table[ox][ofs->config.userxattr]; } /* * When changing ownership of an upper object map the intended ownership * according to the upper layer's idmapping. When an upper mount idmaps files * that are stored on-disk as owned by id 1001 to id 1000 this means stat on * this object will report it as being owned by id 1000 when calling stat via * the upper mount. * In order to change ownership of an object so stat reports id 1000 when * called on an idmapped upper mount the value written to disk - i.e., the * value stored in ia_*id - must 1001. The mount mapping helper will thus take * care to map 1000 to 1001. * The mnt idmapping helpers are nops if the upper layer isn't idmapped. */ static inline int ovl_do_notify_change(struct ovl_fs *ofs, struct dentry *upperdentry, struct iattr *attr) { return notify_change(ovl_upper_mnt_idmap(ofs), upperdentry, attr, NULL); } static inline int ovl_do_rmdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; } static inline int ovl_do_unlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { int err = vfs_unlink(ovl_upper_mnt_idmap(ofs), dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; } static inline int ovl_do_link(struct ovl_fs *ofs, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { int err = vfs_link(old_dentry, ovl_upper_mnt_idmap(ofs), dir, new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; } static inline int ovl_do_create(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { int err = vfs_create(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mkdir(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode) { int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mknod(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { int err = vfs_mknod(ovl_upper_mnt_idmap(ofs), dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; } static inline int ovl_do_symlink(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry, const char *oldname) { int err = vfs_symlink(ovl_upper_mnt_idmap(ofs), dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; } static inline ssize_t ovl_do_getxattr(const struct path *path, const char *name, void *value, size_t size) { int err, len; WARN_ON(path->dentry->d_sb != path->mnt->mnt_sb); err = vfs_getxattr(mnt_idmap(path->mnt), path->dentry, name, value, size); len = (value && err > 0) ? err : 0; pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", path->dentry, name, min(len, 48), value, size, err); return err; } static inline ssize_t ovl_getxattr_upper(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, void *value, size_t size) { struct path upperpath = { .dentry = upperdentry, .mnt = ovl_upper_mnt(ofs), }; return ovl_do_getxattr(&upperpath, ovl_xattr(ofs, ox), value, size); } static inline ssize_t ovl_path_getxattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox, void *value, size_t size) { return ovl_do_getxattr(path, ovl_xattr(ofs, ox), value, size); } static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int err = vfs_setxattr(ovl_upper_mnt_idmap(ofs), dentry, name, value, size, flags); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n", dentry, name, min((int)size, 48), value, size, flags, err); return err; } static inline int ovl_setxattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const void *value, size_t size) { return ovl_do_setxattr(ofs, dentry, ovl_xattr(ofs, ox), value, size, 0); } static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, const char *name) { int err = vfs_removexattr(ovl_upper_mnt_idmap(ofs), dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox) { return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox)); } static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name, struct posix_acl *acl) { return vfs_set_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name, acl); } static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry, const char *acl_name) { return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name); } static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir, struct dentry *olddentry, struct inode *newdir, struct dentry *newdentry, unsigned int flags) { int err; struct renamedata rd = { .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_dir = olddir, .old_dentry = olddentry, .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_dir = newdir, .new_dentry = newdentry, .flags = flags, }; pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); err = vfs_rename(&rd); if (err) { pr_debug("...rename(%pd2, %pd2, ...) = %i\n", olddentry, newdentry, err); } return err; } static inline int ovl_do_whiteout(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry) { int err = vfs_whiteout(ovl_upper_mnt_idmap(ofs), dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; struct file *file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, O_LARGEFILE | O_WRONLY, current_cred()); int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return file; } static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); } static inline bool ovl_open_flags_need_copy_up(int flags) { if (!flags) return false; return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } /* util.c */ int ovl_get_write_access(struct dentry *dentry); void ovl_put_write_access(struct dentry *dentry); void ovl_start_write(struct dentry *dentry); void ovl_end_write(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); void ovl_revert_creds(const struct cred *old_cred); static inline const struct cred *ovl_creds(struct super_block *sb) { return OVL_FS(sb)->creator_cred; } int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); struct ovl_path *ovl_stack_alloc(unsigned int n); void ovl_stack_cpy(struct ovl_path *dst, struct ovl_path *src, unsigned int n); void ovl_stack_put(struct ovl_path *stack, unsigned int n); void ovl_stack_free(struct ovl_path *stack, unsigned int n); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); void ovl_free_entry(struct ovl_entry *oe); bool ovl_dentry_remote(struct dentry *dentry); void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry); void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry, struct ovl_entry *oe); void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, struct ovl_entry *oe, unsigned int mask); bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); void ovl_path_lowerdata(struct dentry *dentry, struct path *path); struct inode *ovl_i_path_real(struct inode *inode, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); int ovl_dentry_set_lowerdata(struct dentry *dentry, struct ovl_path *datapath); const struct ovl_layer *ovl_i_layer_lower(struct inode *inode); const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_lowerdata(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); struct inode *ovl_inode_realdata(struct inode *inode); const char *ovl_lowerdata_redirect(struct inode *inode); struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry); void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); bool ovl_dentry_has_xwhiteouts(struct dentry *dentry); void ovl_dentry_set_xwhiteouts(struct dentry *dentry); void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs, const struct ovl_layer *layer); bool ovl_dentry_has_upper_alias(struct dentry *dentry); void ovl_dentry_set_upper_alias(struct dentry *dentry); bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags); bool ovl_has_upperdata(struct inode *inode); void ovl_set_upperdata(struct inode *inode); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_inode_version_get(struct inode *inode); bool ovl_is_whiteout(struct dentry *dentry); bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path); struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); bool ovl_already_copied_up(struct dentry *dentry, int flags); char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, const struct path *upperpath); static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs, struct dentry *upperdentry) { struct path upperpath = { .dentry = upperdentry, .mnt = ovl_upper_mnt(ofs), }; return ovl_path_is_whiteout(ofs, &upperpath); } static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *upperdentry) { struct path upperpath = { .dentry = upperdentry, .mnt = ovl_upper_mnt(ofs), }; return ovl_path_check_origin_xattr(ofs, &upperpath); } int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); bool ovl_is_inuse(struct dentry *dentry); bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, struct ovl_metacopy *data); int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy); bool ovl_is_meta