Total coverage: 185402 (14%)of 1373081
107 97 95 95 34 34 108 34 32 2 129 129 35 34 35 2 3 135 12 70 135 218 218 140 198 138 218 6 135 136 1 136 80 80 77 70 80 127 128 125 128 79 63 17 17 12 9 8 8 7 2 2 6 1 1 5 1 1 4 1 3 2 1 11 11 83 82 83 131 2 130 1 129 129 129 129 129 129 207 107 32 97 108 17 195 195 6 194 4 193 188 66 27 27 83 52 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include "disasm.h" #define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) static const char * const func_id_str[] = { __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) }; #undef __BPF_FUNC_STR_FN static const char *__func_get_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; if (cbs && cbs->cb_call) { const char *res; res = cbs->cb_call(cbs->private_data, insn); if (res) return res; } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) snprintf(buff, len, "kernel-function"); return buff; } static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, u64 full_imm, char *buff, size_t len) { if (cbs && cbs->cb_imm) return cbs->cb_imm(cbs->private_data, insn, full_imm); snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); return buff; } const char *func_id_name(int id) { if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else return "unknown"; } const char *const bpf_class_string[8] = { [BPF_LD] = "ld", [BPF_LDX] = "ldx", [BPF_ST] = "st", [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; const char *const bpf_alu_string[16] = { [BPF_ADD >> 4] = "+=", [BPF_SUB >> 4] = "-=", [BPF_MUL >> 4] = "*=", [BPF_DIV >> 4] = "/=", [BPF_OR >> 4] = "|=", [BPF_AND >> 4] = "&=", [BPF_LSH >> 4] = "<<=", [BPF_RSH >> 4] = ">>=", [BPF_NEG >> 4] = "neg", [BPF_MOD >> 4] = "%=", [BPF_XOR >> 4] = "^=", [BPF_MOV >> 4] = "=", [BPF_ARSH >> 4] = "s>>=", [BPF_END >> 4] = "endian", }; static const char *const bpf_alu_sign_string[16] = { [BPF_DIV >> 4] = "s/=", [BPF_MOD >> 4] = "s%=", }; static const char *const bpf_movsx_string[4] = { [0] = "(s8)", [1] = "(s16)", [3] = "(s32)", }; static const char *const bpf_atomic_alu_string[16] = { [BPF_ADD >> 4] = "add", [BPF_AND >> 4] = "and", [BPF_OR >> 4] = "or", [BPF_XOR >> 4] = "xor", }; static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", [BPF_B >> 3] = "u8", [BPF_DW >> 3] = "u64", }; static const char *const bpf_ldsx_string[] = { [BPF_W >> 3] = "s32", [BPF_H >> 3] = "s16", [BPF_B >> 3] = "s8", }; static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", [BPF_JGT >> 4] = ">", [BPF_JLT >> 4] = "<", [BPF_JGE >> 4] = ">=", [BPF_JLE >> 4] = "<=", [BPF_JSET >> 4] = "&", [BPF_JNE >> 4] = "!=", [BPF_JSGT >> 4] = "s>", [BPF_JSLT >> 4] = "s<", [BPF_JSGE >> 4] = "s>=", [BPF_JSLE >> 4] = "s<=", [BPF_CALL >> 4] = "call", [BPF_EXIT >> 4] = "exit", }; static void print_bpf_end_insn(bpf_insn_print_t verbose, void *private_data, const struct bpf_insn *insn) { verbose(private_data, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", insn->imm, insn->dst_reg); } static void print_bpf_bswap_insn(bpf_insn_print_t verbose, void *private_data, const struct bpf_insn *insn) { verbose(private_data, "(%02x) r%d = bswap%d r%d\n", insn->code, insn->dst_reg, insn->imm, insn->dst_reg); } static bool is_sdiv_smod(const struct bpf_insn *insn) { return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) && insn->off == 1; } static bool is_movsx(const struct bpf_insn *insn) { return BPF_OP(insn->code) == BPF_MOV && (insn->off == 8 || insn->off == 16 || insn->off == 32); } static bool is_addr_space_cast(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_SPACE_CAST; } /* Special (internal-only) form of mov, used to resolve per-CPU addrs: * dst_reg = src_reg + <percpu_base_off> * BPF_ADDR_PERCPU is used as a special insn->off value. */ #define BPF_ADDR_PERCPU (-1) static inline bool is_mov_percpu_addr(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; } void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) { const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) print_bpf_bswap_insn(verbose, cbs->private_data, insn); else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (is_addr_space_cast(insn)) { verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (is_mov_percpu_addr(insn)) { verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n", insn->code, insn->dst_reg, insn->src_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] : bpf_alu_string[BPF_OP(insn->code) >> 4], is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "", class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { verbose(cbs->private_data, "(%02x) %c%d %s %d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] : bpf_alu_string[BPF_OP(insn->code) >> 4], insn->imm); } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == BPF_ADD || insn->imm == BPF_AND || insn->imm == BPF_OR || insn->imm == BPF_XOR)) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, bpf_alu_string[BPF_OP(insn->imm) >> 4], insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && (insn->imm == (BPF_ADD | BPF_FETCH) || insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH))) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_CMPXCHG) { verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", insn->code, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_XCHG) { verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_LOAD_ACQ) { verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && insn->imm == BPF_STORE_REL) { verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) == BPF_MEM) { verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { verbose(cbs->private_data, "(%02x) nospec\n", insn->code); } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); } } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, BPF_MODE(insn->code) == BPF_MEM ? bpf_ldst_string[BPF_SIZE(insn->code) >> 3] : bpf_ldsx_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); } else if (BPF_MODE(insn->code) == BPF_IMM && BPF_SIZE(insn->code) == BPF_DW) { /* At this point, we already made sure that the second * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, imm, tmp, sizeof(tmp))); } else { verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_JA | BPF_X)) { verbose(cbs->private_data, "(%02x) gotox r%d\n", insn->code, insn->dst_reg); } else if (insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO) { verbose(cbs->private_data, "(%02x) may_goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP32 | BPF_JA)) { verbose(cbs->private_data, "(%02x) gotol pc%+d\n", insn->code, insn->imm); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { verbose(cbs->private_data, "(%02x) if %c%d %s %c%d goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { verbose(cbs->private_data, "(%02x) if %c%d %s 0x%x goto pc%+d\n", insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], (u32)insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 // SPDX-License-Identifier: GPL-2.0+ /* * Copyright 2019 Hans de Goede <hdegoede@redhat.com> */ #include <linux/module.h> #include <linux/pm.h> #include <linux/usb.h> #include <drm/clients/drm_client_setup.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_atomic_state_helper.h> #include <drm/drm_connector.h> #include <drm/drm_damage_helper.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_fbdev_shmem.h> #include <drm/drm_file.h> #include <drm/drm_format_helper.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_gem_atomic_helper.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_print.h> #include <drm/drm_probe_helper.h> #include <drm/drm_simple_kms_helper.h> static bool eco_mode; module_param(eco_mode, bool, 0644); MODULE_PARM_DESC(eco_mode, "Turn on Eco mode (less bright, more silent)"); #define DRIVER_NAME "gm12u320" #define DRIVER_DESC "Grain Media GM12U320 USB projector display" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 /* * The DLP has an actual width of 854 pixels, but that is not a multiple * of 8, breaking things left and right, so we export a width of 848. */ #define GM12U320_USER_WIDTH 848 #define GM12U320_REAL_WIDTH 854 #define GM12U320_HEIGHT 480 #define GM12U320_BLOCK_COUNT 20 #define GM12U320_ERR(fmt, ...) \ DRM_DEV_ERROR(gm12u320->dev.dev, fmt, ##__VA_ARGS__) #define MISC_RCV_EPT 1 #define DATA_RCV_EPT 2 #define DATA_SND_EPT 3 #define MISC_SND_EPT 4 #define DATA_BLOCK_HEADER_SIZE 84 #define DATA_BLOCK_CONTENT_SIZE 64512 #define DATA_BLOCK_FOOTER_SIZE 20 #define DATA_BLOCK_SIZE (DATA_BLOCK_HEADER_SIZE + \ DATA_BLOCK_CONTENT_SIZE + \ DATA_BLOCK_FOOTER_SIZE) #define DATA_LAST_BLOCK_CONTENT_SIZE 4032 #define DATA_LAST_BLOCK_SIZE (DATA_BLOCK_HEADER_SIZE + \ DATA_LAST_BLOCK_CONTENT_SIZE + \ DATA_BLOCK_FOOTER_SIZE) #define CMD_SIZE 31 #define READ_STATUS_SIZE 13 #define MISC_VALUE_SIZE 4 #define CMD_TIMEOUT 200 #define DATA_TIMEOUT 1000 #define IDLE_TIMEOUT 2000 #define FIRST_FRAME_TIMEOUT 2000 #define MISC_REQ_GET_SET_ECO_A 0xff #define MISC_REQ_GET_SET_ECO_B 0x35 /* Windows driver does once every second, with arg d = 1, other args 0 */ #define MISC_REQ_UNKNOWN1_A 0xff #define MISC_REQ_UNKNOWN1_B 0x38 /* Windows driver does this on init, with arg a, b = 0, c = 0xa0, d = 4 */ #define MISC_REQ_UNKNOWN2_A 0xa5 #define MISC_REQ_UNKNOWN2_B 0x00 struct gm12u320_device { struct drm_device dev; struct drm_simple_display_pipe pipe; struct drm_connector conn; unsigned char *cmd_buf; unsigned char *data_buf[GM12U320_BLOCK_COUNT]; struct { struct delayed_work work; struct mutex lock; struct drm_framebuffer *fb; struct drm_rect rect; int frame; int draw_status_timeout; struct iosys_map src_map; } fb_update; }; #define to_gm12u320(__dev) container_of(__dev, struct gm12u320_device, dev) static const char cmd_data[CMD_SIZE] = { 0x55, 0x53, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x68, 0xfc, 0x00, 0x00, 0x00, 0x00, 0x10, 0xff, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const char cmd_draw[CMD_SIZE] = { 0x55, 0x53, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0xfe, 0x00, 0x00, 0x00, 0xc0, 0xd1, 0x05, 0x00, 0x40, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const char cmd_misc[CMD_SIZE] = { 0x55, 0x53, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x80, 0x01, 0x10, 0xfd, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const char data_block_header[DATA_BLOCK_HEADER_SIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfb, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x15, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x01, 0x00, 0x00, 0xdb }; static const char data_last_block_header[DATA_BLOCK_HEADER_SIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfb, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x20, 0x00, 0xc0, 0x0f, 0x00, 0x00, 0x01, 0x00, 0x00, 0xd7 }; static const char data_block_footer[DATA_BLOCK_FOOTER_SIZE] = { 0xfb, 0x14, 0x02, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x4f }; static inline struct usb_device *gm12u320_to_usb_device(struct gm12u320_device *gm12u320) { return interface_to_usbdev(to_usb_interface(gm12u320->dev.dev)); } static int gm12u320_usb_alloc(struct gm12u320_device *gm12u320) { int i, block_size; const char *hdr; gm12u320->cmd_buf = drmm_kmalloc(&gm12u320->dev, CMD_SIZE, GFP_KERNEL); if (!gm12u320->cmd_buf) return -ENOMEM; for (i = 0; i < GM12U320_BLOCK_COUNT; i++) { if (i == GM12U320_BLOCK_COUNT - 1) { block_size = DATA_LAST_BLOCK_SIZE; hdr = data_last_block_header; } else { block_size = DATA_BLOCK_SIZE; hdr = data_block_header; } gm12u320->data_buf[i] = drmm_kzalloc(&gm12u320->dev, block_size, GFP_KERNEL); if (!gm12u320->data_buf[i]) return -ENOMEM; memcpy(gm12u320->data_buf[i], hdr, DATA_BLOCK_HEADER_SIZE); memcpy(gm12u320->data_buf[i] + (block_size - DATA_BLOCK_FOOTER_SIZE), data_block_footer, DATA_BLOCK_FOOTER_SIZE); } return 0; } static int gm12u320_misc_request(struct gm12u320_device *gm12u320, u8 req_a, u8 req_b, u8 arg_a, u8 arg_b, u8 arg_c, u8 arg_d) { struct usb_device *udev = gm12u320_to_usb_device(gm12u320); int ret, len; memcpy(gm12u320->cmd_buf, &cmd_misc, CMD_SIZE); gm12u320->cmd_buf[20] = req_a; gm12u320->cmd_buf[21] = req_b; gm12u320->cmd_buf[22] = arg_a; gm12u320->cmd_buf[23] = arg_b; gm12u320->cmd_buf[24] = arg_c; gm12u320->cmd_buf[25] = arg_d; /* Send request */ ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, MISC_SND_EPT), gm12u320->cmd_buf, CMD_SIZE, &len, CMD_TIMEOUT); if (ret || len != CMD_SIZE) { GM12U320_ERR("Misc. req. error %d\n", ret); return -EIO; } /* Read value */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, MISC_RCV_EPT), gm12u320->cmd_buf, MISC_VALUE_SIZE, &len, DATA_TIMEOUT); if (ret || len != MISC_VALUE_SIZE) { GM12U320_ERR("Misc. value error %d\n", ret); return -EIO; } /* cmd_buf[0] now contains the read value, which we don't use */ /* Read status */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, MISC_RCV_EPT), gm12u320->cmd_buf, READ_STATUS_SIZE, &len, CMD_TIMEOUT); if (ret || len != READ_STATUS_SIZE) { GM12U320_ERR("Misc. status error %d\n", ret); return -EIO; } return 0; } static void gm12u320_32bpp_to_24bpp_packed(u8 *dst, u8 *src, int len) { while (len--) { *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; src++; } } static void gm12u320_copy_fb_to_blocks(struct gm12u320_device *gm12u320) { int block, dst_offset, len, remain, ret, x1, x2, y1, y2; struct drm_framebuffer *fb; void *vaddr; u8 *src; mutex_lock(&gm12u320->fb_update.lock); if (!gm12u320->fb_update.fb) goto unlock; fb = gm12u320->fb_update.fb; x1 = gm12u320->fb_update.rect.x1; x2 = gm12u320->fb_update.rect.x2; y1 = gm12u320->fb_update.rect.y1; y2 = gm12u320->fb_update.rect.y2; vaddr = gm12u320->fb_update.src_map.vaddr; /* TODO: Use mapping abstraction properly */ ret = drm_gem_fb_begin_cpu_access(fb, DMA_FROM_DEVICE); if (ret) { GM12U320_ERR("drm_gem_fb_begin_cpu_access err: %d\n", ret); goto put_fb; } src = vaddr + y1 * fb->pitches[0] + x1 * 4; x1 += (GM12U320_REAL_WIDTH - GM12U320_USER_WIDTH) / 2; x2 += (GM12U320_REAL_WIDTH - GM12U320_USER_WIDTH) / 2; for (; y1 < y2; y1++) { remain = 0; len = (x2 - x1) * 3; dst_offset = (y1 * GM12U320_REAL_WIDTH + x1) * 3; block = dst_offset / DATA_BLOCK_CONTENT_SIZE; dst_offset %= DATA_BLOCK_CONTENT_SIZE; if ((dst_offset + len) > DATA_BLOCK_CONTENT_SIZE) { remain = dst_offset + len - DATA_BLOCK_CONTENT_SIZE; len = DATA_BLOCK_CONTENT_SIZE - dst_offset; } dst_offset += DATA_BLOCK_HEADER_SIZE; len /= 3; gm12u320_32bpp_to_24bpp_packed( gm12u320->data_buf[block] + dst_offset, src, len); if (remain) { block++; dst_offset = DATA_BLOCK_HEADER_SIZE; gm12u320_32bpp_to_24bpp_packed( gm12u320->data_buf[block] + dst_offset, src + len * 4, remain / 3); } src += fb->pitches[0]; } drm_gem_fb_end_cpu_access(fb, DMA_FROM_DEVICE); put_fb: drm_framebuffer_put(fb); gm12u320->fb_update.fb = NULL; unlock: mutex_unlock(&gm12u320->fb_update.lock); } static void gm12u320_fb_update_work(struct work_struct *work) { struct gm12u320_device *gm12u320 = container_of(to_delayed_work(work), struct gm12u320_device, fb_update.work); struct usb_device *udev = gm12u320_to_usb_device(gm12u320); int block, block_size, len; int ret = 0; gm12u320_copy_fb_to_blocks(gm12u320); for (block = 0; block < GM12U320_BLOCK_COUNT; block++) { if (block == GM12U320_BLOCK_COUNT - 1) block_size = DATA_LAST_BLOCK_SIZE; else block_size = DATA_BLOCK_SIZE; /* Send data command to device */ memcpy(gm12u320->cmd_buf, cmd_data, CMD_SIZE); gm12u320->cmd_buf[8] = block_size & 0xff; gm12u320->cmd_buf[9] = block_size >> 8; gm12u320->cmd_buf[20] = 0xfc - block * 4; gm12u320->cmd_buf[21] = block | (gm12u320->fb_update.frame << 7); ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, DATA_SND_EPT), gm12u320->cmd_buf, CMD_SIZE, &len, CMD_TIMEOUT); if (ret || len != CMD_SIZE) goto err; /* Send data block to device */ ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, DATA_SND_EPT), gm12u320->data_buf[block], block_size, &len, DATA_TIMEOUT); if (ret || len != block_size) goto err; /* Read status */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, DATA_RCV_EPT), gm12u320->cmd_buf, READ_STATUS_SIZE, &len, CMD_TIMEOUT); if (ret || len != READ_STATUS_SIZE) goto err; } /* Send draw command to device */ memcpy(gm12u320->cmd_buf, cmd_draw, CMD_SIZE); ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, DATA_SND_EPT), gm12u320->cmd_buf, CMD_SIZE, &len, CMD_TIMEOUT); if (ret || len != CMD_SIZE) goto err; /* Read status */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, DATA_RCV_EPT), gm12u320->cmd_buf, READ_STATUS_SIZE, &len, gm12u320->fb_update.draw_status_timeout); if (ret || len != READ_STATUS_SIZE) goto err; gm12u320->fb_update.draw_status_timeout = CMD_TIMEOUT; gm12u320->fb_update.frame = !gm12u320->fb_update.frame; /* * We must draw a frame every 2s otherwise the projector * switches back to showing its logo. */ queue_delayed_work(system_long_wq, &gm12u320->fb_update.work, msecs_to_jiffies(IDLE_TIMEOUT)); return; err: /* Do not log errors caused by module unload or device unplug */ if (ret != -ENODEV && ret != -ECONNRESET && ret != -ESHUTDOWN) GM12U320_ERR("Frame update error: %d\n", ret); } static void gm12u320_fb_mark_dirty(struct drm_framebuffer *fb, const struct iosys_map *map, struct drm_rect *dirty) { struct gm12u320_device *gm12u320 = to_gm12u320(fb->dev); struct drm_framebuffer *old_fb = NULL; bool wakeup = false; mutex_lock(&gm12u320->fb_update.lock); if (gm12u320->fb_update.fb != fb) { old_fb = gm12u320->fb_update.fb; drm_framebuffer_get(fb); gm12u320->fb_update.fb = fb; gm12u320->fb_update.rect = *dirty; gm12u320->fb_update.src_map = *map; wakeup = true; } else { struct drm_rect *rect = &gm12u320->fb_update.rect; rect->x1 = min(rect->x1, dirty->x1); rect->y1 = min(rect->y1, dirty->y1); rect->x2 = max(rect->x2, dirty->x2); rect->y2 = max(rect->y2, dirty->y2); } mutex_unlock(&gm12u320->fb_update.lock); if (wakeup) mod_delayed_work(system_long_wq, &gm12u320->fb_update.work, 0); if (old_fb) drm_framebuffer_put(old_fb); } static void gm12u320_stop_fb_update(struct gm12u320_device *gm12u320) { struct drm_framebuffer *old_fb; cancel_delayed_work_sync(&gm12u320->fb_update.work); mutex_lock(&gm12u320->fb_update.lock); old_fb = gm12u320->fb_update.fb; gm12u320->fb_update.fb = NULL; iosys_map_clear(&gm12u320->fb_update.src_map); mutex_unlock(&gm12u320->fb_update.lock); drm_framebuffer_put(old_fb); } static int gm12u320_set_ecomode(struct gm12u320_device *gm12u320) { return gm12u320_misc_request(gm12u320, MISC_REQ_GET_SET_ECO_A, MISC_REQ_GET_SET_ECO_B, 0x01 /* set */, eco_mode ? 0x01 : 0x00, 0x00, 0x01); } /* ------------------------------------------------------------------ */ /* gm12u320 connector */ /* * We use fake EDID info so that userspace know that it is dealing with * an Acer projector, rather then listing this as an "unknown" monitor. * Note this assumes this driver is only ever used with the Acer C120, if we * add support for other devices the vendor and model should be parameterized. */ static const struct edid gm12u320_edid = { .header = { 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, .mfg_id = { 0x04, 0x72 }, /* "ACR" */ .prod_code = { 0x20, 0xc1 }, /* C120h */ .serial = 0xaa55aa55, .mfg_week = 1, .mfg_year = 16, .version = 1, /* EDID 1.3 */ .revision = 3, /* EDID 1.3 */ .input = 0x08, /* Analog input */ .features = 0x0a, /* Pref timing in DTD 1 */ .standard_timings = { { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 } }, .detailed_timings = { { .pixel_clock = 3383, /* hactive = 848, hblank = 256 */ .data.pixel_data.hactive_lo = 0x50, .data.pixel_data.hblank_lo = 0x00, .data.pixel_data.hactive_hblank_hi = 0x31, /* vactive = 480, vblank = 28 */ .data.pixel_data.vactive_lo = 0xe0, .data.pixel_data.vblank_lo = 0x1c, .data.pixel_data.vactive_vblank_hi = 0x10, /* hsync offset 40 pw 128, vsync offset 1 pw 4 */ .data.pixel_data.hsync_offset_lo = 0x28, .data.pixel_data.hsync_pulse_width_lo = 0x80, .data.pixel_data.vsync_offset_pulse_width_lo = 0x14, .data.pixel_data.hsync_vsync_offset_pulse_width_hi = 0x00, /* Digital separate syncs, hsync+, vsync+ */ .data.pixel_data.misc = 0x1e, }, { .pixel_clock = 0, .data.other_data.type = 0xfd, /* Monitor ranges */ .data.other_data.data.range.min_vfreq = 59, .data.other_data.data.range.max_vfreq = 61, .data.other_data.data.range.min_hfreq_khz = 29, .data.other_data.data.range.max_hfreq_khz = 32, .data.other_data.data.range.pixel_clock_mhz = 4, /* 40 MHz */ .data.other_data.data.range.flags = 0, .data.other_data.data.range.formula.cvt = { 0xa0, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }, }, { .pixel_clock = 0, .data.other_data.type = 0xfc, /* Model string */ .data.other_data.data.str.str = { 'P', 'r', 'o', 'j', 'e', 'c', 't', 'o', 'r', '\n', ' ', ' ', ' ' }, }, { .pixel_clock = 0, .data.other_data.type = 0xfe, /* Unspecified text / padding */ .data.other_data.data.str.str = { '\n', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' }, } }, .checksum = 0x13, }; static int gm12u320_conn_get_modes(struct drm_connector *connector) { const struct drm_edid *drm_edid; int count; drm_edid = drm_edid_alloc(&gm12u320_edid, sizeof(gm12u320_edid)); drm_edid_connector_update(connector, drm_edid); count = drm_edid_connector_add_modes(connector); drm_edid_free(drm_edid); return count; } static const struct drm_connector_helper_funcs gm12u320_conn_helper_funcs = { .get_modes = gm12u320_conn_get_modes, }; static const struct drm_connector_funcs gm12u320_conn_funcs = { .fill_modes = drm_helper_probe_single_connector_modes, .destroy = drm_connector_cleanup, .reset = drm_atomic_helper_connector_reset, .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; static int gm12u320_conn_init(struct gm12u320_device *gm12u320) { drm_connector_helper_add(&gm12u320->conn, &gm12u320_conn_helper_funcs); return drm_connector_init(&gm12u320->dev, &gm12u320->conn, &gm12u320_conn_funcs, DRM_MODE_CONNECTOR_VGA); } /* ------------------------------------------------------------------ */ /* gm12u320 (simple) display pipe */ static void gm12u320_pipe_enable(struct drm_simple_display_pipe *pipe, struct drm_crtc_state *crtc_state, struct drm_plane_state *plane_state) { struct drm_rect rect = { 0, 0, GM12U320_USER_WIDTH, GM12U320_HEIGHT }; struct gm12u320_device *gm12u320 = to_gm12u320(pipe->crtc.dev); struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(plane_state); gm12u320->fb_update.draw_status_timeout = FIRST_FRAME_TIMEOUT; gm12u320_fb_mark_dirty(plane_state->fb, &shadow_plane_state->data[0], &rect); } static void gm12u320_pipe_disable(struct drm_simple_display_pipe *pipe) { struct gm12u320_device *gm12u320 = to_gm12u320(pipe->crtc.dev); gm12u320_stop_fb_update(gm12u320); } static void gm12u320_pipe_update(struct drm_simple_display_pipe *pipe, struct drm_plane_state *old_state) { struct drm_plane_state *state = pipe->plane.state; struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(state); struct drm_rect rect; if (drm_atomic_helper_damage_merged(old_state, state, &rect)) gm12u320_fb_mark_dirty(state->fb, &shadow_plane_state->data[0], &rect); } static const struct drm_simple_display_pipe_funcs gm12u320_pipe_funcs = { .enable = gm12u320_pipe_enable, .disable = gm12u320_pipe_disable, .update = gm12u320_pipe_update, DRM_GEM_SIMPLE_DISPLAY_PIPE_SHADOW_PLANE_FUNCS, }; static const uint32_t gm12u320_pipe_formats[] = { DRM_FORMAT_XRGB8888, }; static const uint64_t gm12u320_pipe_modifiers[] = { DRM_FORMAT_MOD_LINEAR, DRM_FORMAT_MOD_INVALID }; DEFINE_DRM_GEM_FOPS(gm12u320_fops); static const struct drm_driver gm12u320_drm_driver = { .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC, .name = DRIVER_NAME, .desc = DRIVER_DESC, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, .fops = &gm12u320_fops, DRM_GEM_SHMEM_DRIVER_OPS, DRM_FBDEV_SHMEM_DRIVER_OPS, }; static const struct drm_mode_config_funcs gm12u320_mode_config_funcs = { .fb_create = drm_gem_fb_create_with_dirty, .atomic_check = drm_atomic_helper_check, .atomic_commit = drm_atomic_helper_commit, }; static int gm12u320_usb_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct gm12u320_device *gm12u320; struct drm_device *dev; struct device *dma_dev; int ret; /* * The gm12u320 presents itself to the system as 2 usb mass-storage * interfaces, we only care about / need the first one. */ if (interface->cur_altsetting->desc.bInterfaceNumber != 0) return -ENODEV; gm12u320 = devm_drm_dev_alloc(&interface->dev, &gm12u320_drm_driver, struct gm12u320_device, dev); if (IS_ERR(gm12u320)) return PTR_ERR(gm12u320); dev = &gm12u320->dev; dma_dev = usb_intf_get_dma_device(interface); if (dma_dev) { drm_dev_set_dma_dev(dev, dma_dev); put_device(dma_dev); } else { drm_warn(dev, "buffer sharing not supported"); /* not an error */ } INIT_DELAYED_WORK(&gm12u320->fb_update.work, gm12u320_fb_update_work); mutex_init(&gm12u320->fb_update.lock); ret = drmm_mode_config_init(dev); if (ret) return ret; dev->mode_config.min_width = GM12U320_USER_WIDTH; dev->mode_config.max_width = GM12U320_USER_WIDTH; dev->mode_config.min_height = GM12U320_HEIGHT; dev->mode_config.max_height = GM12U320_HEIGHT; dev->mode_config.funcs = &gm12u320_mode_config_funcs; ret = gm12u320_usb_alloc(gm12u320); if (ret) return ret; ret = gm12u320_set_ecomode(gm12u320); if (ret) return ret; ret = gm12u320_conn_init(gm12u320); if (ret) return ret; ret = drm_simple_display_pipe_init(&gm12u320->dev, &gm12u320->pipe, &gm12u320_pipe_funcs, gm12u320_pipe_formats, ARRAY_SIZE(gm12u320_pipe_formats), gm12u320_pipe_modifiers, &gm12u320->conn); if (ret) return ret; drm_mode_config_reset(dev); usb_set_intfdata(interface, dev); ret = drm_dev_register(dev, 0); if (ret) return ret; drm_client_setup(dev, NULL); return 0; } static void gm12u320_usb_disconnect(struct usb_interface *interface) { struct drm_device *dev = usb_get_intfdata(interface); drm_dev_unplug(dev); drm_atomic_helper_shutdown(dev); } static int gm12u320_suspend(struct usb_interface *interface, pm_message_t message) { struct drm_device *dev = usb_get_intfdata(interface); return drm_mode_config_helper_suspend(dev); } static int gm12u320_resume(struct usb_interface *interface) { struct drm_device *dev = usb_get_intfdata(interface); struct gm12u320_device *gm12u320 = to_gm12u320(dev); gm12u320_set_ecomode(gm12u320); return drm_mode_config_helper_resume(dev); } static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1de1, 0xc102) }, {}, }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_driver gm12u320_usb_driver = { .name = "gm12u320", .probe = gm12u320_usb_probe, .disconnect = gm12u320_usb_disconnect, .id_table = id_table, .suspend = pm_ptr(gm12u320_suspend), .resume = pm_ptr(gm12u320_resume), .reset_resume = pm_ptr(gm12u320_resume), }; module_usb_driver(gm12u320_usb_driver); MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>"); MODULE_DESCRIPTION("GM12U320 driver for USB projectors"); MODULE_LICENSE("GPL");
10 1 8 2 8 8 5 5 3 3 38 16 12 3 4 4 2 1 3 13 13 13 13 13 13 2 2 12 12 11 12 11 11 10 11 11 3 2 4 84 3 3 4 4 4 4 4 10 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 /* SPDX-License-Identifier: GPL-2.0 */ /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __MPTCP_PROTOCOL_H #define __MPTCP_PROTOCOL_H #include <linux/random.h> #include <net/tcp.h> #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> #include <net/genetlink.h> #include <net/rstreason.h> #define MPTCP_SUPPORTED_VERSION 1 /* MPTCP option bits */ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) #define OPTION_MPTCP_MPJ_SYN BIT(3) #define OPTION_MPTCP_MPJ_SYNACK BIT(4) #define OPTION_MPTCP_MPJ_ACK BIT(5) #define OPTION_MPTCP_ADD_ADDR BIT(6) #define OPTION_MPTCP_RM_ADDR BIT(7) #define OPTION_MPTCP_FASTCLOSE BIT(8) #define OPTION_MPTCP_PRIO BIT(9) #define OPTION_MPTCP_RST BIT(10) #define OPTION_MPTCP_DSS BIT(11) #define OPTION_MPTCP_FAIL BIT(12) #define OPTION_MPTCP_CSUMREQD BIT(13) #define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \ OPTION_MPTCP_MPC_ACK) #define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \ OPTION_MPTCP_MPJ_ACK) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 #define MPTCPOPT_MP_JOIN 1 #define MPTCPOPT_DSS 2 #define MPTCPOPT_ADD_ADDR 3 #define MPTCPOPT_RM_ADDR 4 #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 #define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 #define TCPOLEN_MPTCP_MPJ_SYN 12 #define TCPOLEN_MPTCP_MPJ_SYNACK 16 #define TCPOLEN_MPTCP_MPJ_ACK 24 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 #define TCPOLEN_MPTCP_ADD_ADDR 16 #define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 #define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 #define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 #define TCPOLEN_MPTCP_ADD_ADDR6 28 #define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_PORT_ALIGN 2 #define TCPOLEN_MPTCP_RM_ADDR_BASE 3 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 #define TCPOLEN_MPTCP_FAIL 12 #define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) #define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) #define MPTCP_CAP_CHECKSUM_REQD BIT(7) #define MPTCP_CAP_EXTENSIBILITY BIT(6) #define MPTCP_CAP_DENY_JOIN_ID0 BIT(5) #define MPTCP_CAP_HMAC_SHA256 BIT(0) #define MPTCP_CAP_FLAG_MASK (0x1F) /* MPTCP DSS flags */ #define MPTCP_DSS_DATA_FIN BIT(4) #define MPTCP_DSS_DSN64 BIT(3) #define MPTCP_DSS_HAS_MAP BIT(2) #define MPTCP_DSS_ACK64 BIT(1) #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) /* MPTCP ADD_ADDR flags */ #define MPTCP_ADDR_ECHO BIT(0) /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) /* MPTCP TCPRST flags */ #define MPTCP_RST_TRANSIENT BIT(0) /* MPTCP socket atomic flags */ #define MPTCP_WORK_RTX 1 #define MPTCP_FALLBACK_DONE 2 #define MPTCP_WORK_CLOSE_SUBFLOW 3 /* MPTCP socket release cb flags */ #define MPTCP_PUSH_PENDING 1 #define MPTCP_CLEAN_UNA 2 #define MPTCP_ERROR_REPORT 3 #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; u8 has_rxtstamp; u8 cant_coalesce; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) static inline bool before64(__u64 seq1, __u64 seq2) { return (__s64)(seq1 - seq2) < 0; } #define after64(seq2, seq1) before64(seq1, seq2) struct mptcp_options_received { u64 sndr_key; u64 rcvr_key; u64 data_ack; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; struct_group(status, u16 suboptions; u16 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, reset_reason:4, reset_transient:1, echo:1, backup:1, deny_join_id0:1, __unused:2; ); u8 join_id; u32 token; u32 nonce; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; struct mptcp_rm_list rm_list; u64 ahmac; u64 fail_seq; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | ((nib & 0xF) << 8) | field); } enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, MPTCP_PM_ADD_ADDR_SEND_ACK, MPTCP_PM_RM_ADDR_RECEIVED, MPTCP_PM_ESTABLISHED, MPTCP_PM_SUBFLOW_ESTABLISHED, MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is * accounted int id_avail_bitmap */ }; enum mptcp_pm_type { MPTCP_PM_TYPE_KERNEL = 0, MPTCP_PM_TYPE_USERSPACE, __MPTCP_PM_TYPE_NR, __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1, }; /* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */ #define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1) enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, MPTCP_RM_ADDR_SIGNAL, }; /* max value of mptcp_addr_info.id */ #define MPTCP_PM_MAX_ADDR_ID U8_MAX struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; struct list_head anno_list; struct list_head userspace_pm_local_addr_list; spinlock_t lock; /*protects the whole PM data */ struct_group(reset, u8 addr_signal; bool server_side; bool work_pending; bool accept_addr; bool accept_subflow; bool remote_deny_join_id0; u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; u8 extra_subflows; u8 status; ); DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_rx; }; struct mptcp_pm_local { struct mptcp_addr_info addr; u8 flags; int ifindex; }; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; u8 flags; int ifindex; struct socket *lsk; }; struct mptcp_data_frag { struct list_head list; u64 data_seq; u16 data_len; u16 offset; u16 overhead; u16 already_sent; struct page *page; }; /* MPTCP connection sock */ struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; u64 local_key; /* protected by the first subflow socket lock * lockless access read */ u64 remote_key; /* same as above */ u64 write_seq; u64 bytes_sent; u64 snd_nxt; u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; u64 bytes_retrans; u64 bytes_consumed; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; * recovery related fields are under data_lock * protection */ u64 bytes_acked; u64 snd_una; u64 wnd_end; u32 last_data_sent; u32 last_data_recv; u32 last_ack_recv; unsigned long timer_ival; u32 token; unsigned long flags; unsigned long cb_flags; bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; bool allow_infinite_fallback; u8 pending_state; /* A subflow asked to set this sk_state, * protected by the msk data lock */ u8 mpc_endpoint_id; u8 recvmsg_inq:1, cork:1, nodelay:1, fastopening:1, in_accept_queue:1, free_first:1, rcvspace_init:1; u32 notsent_lowat; int keepalive_cnt; int keepalive_idle; int keepalive_intvl; int maxseg; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; struct list_head join_list; struct sock *first; /* The mptcp ops can safely dereference, using suitable * ONCE annotation, the subflow outside the socket * lock as such sock is freed after close(). */ struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { int space; /* bytes copied in last measurement window */ int copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; u8 scaling_ratio; bool allow_subflows; u32 subflow_id; u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; spinlock_t fallback_lock; /* protects fallback, * allow_infinite_fallback and * allow_join */ struct list_head backlog_list; /* protected by the data lock */ u32 backlog_len; u32 backlog_unaccounted; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) #define mptcp_next_subflow(__msk, __subflow) \ list_next_entry_circular(__subflow, &((__msk)->conn_list), node) extern struct genl_family mptcp_genl_family; static inline void msk_owned_by_me(const struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); } #ifdef CONFIG_DEBUG_NET /* MPTCP-specific: we might (indirectly) call this helper with the wrong sk */ #undef tcp_sk #define tcp_sk(ptr) ({ \ typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_TCP); \ container_of_const(_ptr, struct tcp_sock, inet_conn.icsk_inet.sk); \ }) #define mptcp_sk(ptr) ({ \ typeof(ptr) _ptr = (ptr); \ WARN_ON(_ptr->sk_protocol != IPPROTO_MPTCP); \ container_of_const(_ptr, struct mptcp_sock, sk.icsk_inet.sk); \ }) #else /* !CONFIG_DEBUG_NET */ #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) #endif static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } static inline int mptcp_space_from_win(const struct sock *sk, int win) { return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win); } static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(mptcp_sk(sk)->backlog_len) - sk_rmem_alloc_get(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); return msk->first_pending; } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *cur; cur = msk->first_pending; return list_is_last(&cur->list, &msk->rtx_queue) ? NULL : list_next_entry(cur, list); } static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->first_pending) return NULL; if (WARN_ON_ONCE(list_empty(&msk->rtx_queue))) return NULL; return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (msk->snd_una == msk->snd_nxt) return NULL; return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); } struct csum_pseudo_header { __be64 data_seq; __be32 subflow_seq; __be16 data_len; __sum16 csum; }; struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, backup : 1, request_bkup : 1, csum_reqd : 1, allow_join_id0 : 1; u8 local_id; u8 remote_id; u64 local_key; u64 idsn; u32 token; u32 ssn_offset; u64 thmac; u32 local_nonce; u32 remote_nonce; struct mptcp_sock *msk; struct hlist_nulls_node token_node; }; static inline struct mptcp_subflow_request_sock * mptcp_subflow_rsk(const struct request_sock *rsk) { return (struct mptcp_subflow_request_sock *)rsk; } struct mptcp_delegated_action { struct napi_struct napi; local_lock_t bh_lock; struct list_head head; }; DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); #define MPTCP_DELEGATE_SCHEDULED 0 #define MPTCP_DELEGATE_SEND 1 #define MPTCP_DELEGATE_ACK 2 #define MPTCP_DELEGATE_SNDBUF 3 #define MPTCP_DELEGATE_ACTIONS_MASK (~BIT(MPTCP_DELEGATE_SCHEDULED)) /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ struct_group(reset, unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; u64 remote_key; u64 idsn; u64 map_seq; u64 rcv_wnd_sent; u32 snd_isn; u32 token; u32 rel_write_seq; u32 map_subflow_seq; u32 ssn_offset; u32 map_data_len; __wsum map_data_csum; u32 map_csum_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ request_join : 1, /* send MP_JOIN */ request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ mp_join : 1, /* remote is JOINing */ pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, map_csum_reqd : 1, map_data_fin : 1, mpc_map : 1, backup : 1, send_mp_prio : 1, send_mp_fail : 1, send_fastclose : 1, send_infinite_map : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ closing : 1, /* must not pass rx data to msk anymore */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ close_event_done : 1, /* has done the post-closed part */ mpc_drop : 1, /* the MPC option has been dropped in a rtx */ __unused : 8; bool data_avail; bool scheduled; bool pm_listener; /* a listener managed by the kernel PM? */ bool fully_established; /* path validated */ u32 lent_mem_frag; u32 remote_nonce; u64 thmac; u32 local_nonce; u32 remote_token; union { u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ u64 iasn; /* initial ack sequence number, MPC subflows only */ }; s16 local_id; /* if negative not initialized yet */ u8 remote_id; u8 reset_seen:1; u8 reset_transient:1; u8 reset_reason:4; u8 stale_count; u32 subflow_id; long delegated_status; unsigned long fail_tout; ); struct list_head delegated_node; /* link into delegated_action, protected by local BH */ u32 setsockopt_seq; u32 stale_rcv_tstamp; int cached_sndbuf; /* sndbuf size when last synced with the msk sndbuf, * protected by the msk socket lock */ struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; void (*tcp_state_change)(struct sock *sk); void (*tcp_error_report)(struct sock *sk); struct rcu_head rcu; }; static inline struct mptcp_subflow_context * mptcp_subflow_ctx(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code */ return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data; } static inline struct sock * mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) { return subflow->tcp_sock; } static inline void mptcp_subflow_ctx_reset(struct mptcp_subflow_context *subflow) { memset(&subflow->reset, 0, sizeof(subflow->reset)); subflow->request_mptcp = 1; WRITE_ONCE(subflow->local_id, -1); } /* Convert reset reasons in MPTCP to enum sk_rst_reason type */ static inline enum sk_rst_reason sk_rst_convert_mptcp_reason(u32 reason) { switch (reason) { case MPTCP_RST_EUNSPEC: return SK_RST_REASON_MPTCP_RST_EUNSPEC; case MPTCP_RST_EMPTCP: return SK_RST_REASON_MPTCP_RST_EMPTCP; case MPTCP_RST_ERESOURCE: return SK_RST_REASON_MPTCP_RST_ERESOURCE; case MPTCP_RST_EPROHIBIT: return SK_RST_REASON_MPTCP_RST_EPROHIBIT; case MPTCP_RST_EWQ2BIG: return SK_RST_REASON_MPTCP_RST_EWQ2BIG; case MPTCP_RST_EBADPERF: return SK_RST_REASON_MPTCP_RST_EBADPERF; case MPTCP_RST_EMIDDLEBOX: return SK_RST_REASON_MPTCP_RST_EMIDDLEBOX; default: /* It should not happen, or else errors may occur * in MPTCP layer */ return SK_RST_REASON_ERROR; } } static inline void mptcp_send_active_reset_reason(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); enum sk_rst_reason reason; reason = sk_rst_convert_mptcp_reason(subflow->reset_reason); tcp_send_active_reset(sk, GFP_ATOMIC, reason); } /* Made the fwd mem carried by the given skb available to the msk, * To be paired with a previous mptcp_subflow_lend_fwdmem() before freeing * the skb or setting the skb ownership. */ static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb) { struct sock *ssk = skb->sk; /* The subflow just lend the skb fwd memory; if the subflow meanwhile * closed, mptcp_close_ssk() already released the ssk rcv memory. */ DEBUG_NET_WARN_ON_ONCE(skb->destructor); sk_forward_alloc_add(sk, skb->truesize); if (!ssk) return; atomic_sub(skb->truesize, &ssk->sk_rmem_alloc); skb->sk = NULL; } static inline void __mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, int size) { int frag = (subflow->lent_mem_frag + size) & (PAGE_SIZE - 1); subflow->lent_mem_frag = frag; } static inline void mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, struct sk_buff *skb) { __mptcp_subflow_lend_fwdmem(subflow, skb->truesize); skb->destructor = NULL; } static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { return tcp_sk(mptcp_subflow_tcp_sock(subflow))->copied_seq - subflow->ssn_offset - subflow->map_subflow_seq; } static inline u64 mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) { return subflow->map_seq + mptcp_subflow_get_map_offset(subflow); } void mptcp_subflow_process_delegated(struct sock *ssk, long actions); static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, int action) { long old, set_bits = BIT(MPTCP_DELEGATE_SCHEDULED) | BIT(action); struct mptcp_delegated_action *delegated; bool schedule; /* the caller held the subflow bh socket lock */ lockdep_assert_in_softirq(); /* The implied barrier pairs with tcp_release_cb_override() * mptcp_napi_poll(), and ensures the below list check sees list * updates done prior to delegated status bits changes */ old = set_mask_bits(&subflow->delegated_status, 0, set_bits); if (!(old & BIT(MPTCP_DELEGATE_SCHEDULED))) { if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); } } static inline struct mptcp_subflow_context * mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); if (list_empty(&delegated->head)) { local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return NULL; } ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return ret; } void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp); void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk); int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_path_manager(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); void mptcp_active_disable(struct sock *sk); bool mptcp_active_should_disable(struct sock *ssk); void mptcp_active_enable(struct sock *sk); void mptcp_get_available_schedulers(char *buf, size_t maxlen); void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); void mptcp_remote_address(const struct sock_common *skc, struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); int mptcp_validate_scheduler(struct mptcp_sched_ops *sched); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched); void mptcp_release_sched(struct mptcp_sock *msk); void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled); struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); int mptcp_sched_get_send(struct mptcp_sock *msk); int mptcp_sched_get_retrans(struct mptcp_sock *msk); static inline u64 mptcp_data_avail(const struct mptcp_sock *msk) { return READ_ONCE(msk->bytes_received) - READ_ONCE(msk->bytes_consumed); } static inline bool mptcp_epollin_ready(const struct sock *sk) { u64 data_avail = mptcp_data_avail(mptcp_sk(sk)); if (!data_avail) return false; /* mptcp doesn't have to deal with small skbs in the receive queue, * as it can always coalesce them */ return (data_avail >= sk->sk_rcvlowat) || tcp_under_memory_pressure(sk); } int mptcp_set_rcvlowat(struct sock *sk, int val); static inline bool __tcp_can_send(const struct sock *ssk) { /* only send if our side has not closed yet */ return ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ if (subflow->request_join && !READ_ONCE(subflow->fully_established)) return false; return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); } void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); void mptcp_subflow_drop_ctx(struct sock *ssk); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { sk->sk_data_ready = sock_def_readable; sk->sk_state_change = ctx->tcp_state_change; sk->sk_write_space = sk_stream_write_space; sk->sk_error_report = ctx->tcp_error_report; inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); #endif struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req); void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); void __mptcp_sync_state(struct sock *sk, int state); void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout); static inline void mptcp_stop_tout_timer(struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp) return; sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); inet_csk(sk)->icsk_mtup.probe_timestamp = 0; } static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout) { /* avoid 0 timestamp, as that means no close timeout */ inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1; } static inline void mptcp_start_tout_timer(struct sock *sk) { mptcp_set_close_tout(sk, tcp_jiffies32); mptcp_reset_tout_timer(mptcp_sk(sk), 0); } static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && READ_ONCE(mptcp_sk(sk)->fully_established); } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); int mptcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) { if (use_64bit) return cur_seq; return __mptcp_expand_seq(old_seq, cur_seq); } void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit); static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk) { return READ_ONCE(msk->snd_data_fin_enable) && READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt); } static inline u32 mptcp_notsent_lowat(const struct sock *sk) { struct net *net = sock_net(sk); u32 val; val = READ_ONCE(mptcp_sk(sk)->notsent_lowat); return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 notsent_bytes; notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt); return (notsent_bytes << wake) < mptcp_notsent_lowat(sk); } static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake) { return mptcp_stream_memory_free(sk, wake) && __sk_stream_is_writeable(sk, wake); } static inline void mptcp_write_space(struct sock *sk) { /* pairs with memory barrier in mptcp_poll */ smp_mb(); if (mptcp_stream_memory_free(sk, 1)) sk_stream_write_space(sk); } static inline void __mptcp_sync_sndbuf(struct sock *sk) { struct mptcp_subflow_context *subflow; int ssk_sndbuf, new_sndbuf; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; new_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]); mptcp_for_each_subflow(mptcp_sk(sk), subflow) { ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf); subflow->cached_sndbuf = ssk_sndbuf; new_sndbuf += ssk_sndbuf; } /* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */ WRITE_ONCE(sk->sk_sndbuf, new_sndbuf); mptcp_write_space(sk); } /* The called held both the msk socket and the subflow socket locks, * possibly under BH */ static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf) __mptcp_sync_sndbuf(sk); } /* the caller held only the subflow socket lock, either in process or * BH context. Additionally this can be called under the msk data lock, * so we can't acquire such lock here: let the delegate action acquires * the needed locks in suitable order. */ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf)) return; local_bh_disable(); mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF); local_bh_enable(); } #define MPTCP_TOKEN_MAX_RETRIES 4 void __init mptcp_token_init(void); static inline void mptcp_token_init_request(struct request_sock *req) { mptcp_subflow_rsk(req)->token_node.pprev = NULL; } int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(struct request_sock *req); int mptcp_token_new_connect(struct sock *ssk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); void mptcp_pm_destroy(struct mptcp_sock *msk); int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry); bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk); void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct mptcp_subflow_context *subflow); void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup); void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id); void mptcp_pm_rm_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id); bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr); bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, struct genl_info *info); int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, struct genl_info *info); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); /* the default path manager, used in mptcp_pm_unregister */ extern struct mptcp_pm_ops mptcp_pm_kernel; struct mptcp_pm_ops *mptcp_pm_find(const char *name); int mptcp_pm_register(struct mptcp_pm_ops *pm_ops); void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops); int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops); void mptcp_pm_get_available(char *buf, size_t maxlen); void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); int mptcp_pm_genl_fill_addr(struct sk_buff *msg, struct netlink_callback *cb, struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & (BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO)); } static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); } static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE; } static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk) { return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL; } static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; if (family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; if (!echo) len += MPTCPOPT_THMAC_LEN; /* account for 2 trailing 'nop' options */ if (port) len += TCPOLEN_MPTCP_PORT_LEN + TCPOLEN_MPTCP_PORT_ALIGN; return len; } static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) { if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX) return -EINVAL; return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *skc); int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *skc); bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info); int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info); static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) { int local_id = READ_ONCE(subflow->local_id); if (local_id < 0) return 0; return local_id; } void __init mptcp_pm_kernel_register(void); void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) { spin_lock_bh(&msk->pm.lock); __mptcp_pm_close_subflow(msk); spin_unlock_bh(&msk->pm.lock); } static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk) { return test_bit(MPTCP_FALLBACK_DONE, &msk->flags); } static inline bool mptcp_check_fallback(const struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); return __mptcp_check_fallback(msk); } static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) { struct sock *ssk = READ_ONCE(msk->first); return ssk && ((1 << inet_sk_state_load(ssk)) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_LISTEN)); } bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib); static inline bool mptcp_try_fallback(struct sock *ssk, int fb_mib) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; struct mptcp_sock *msk; msk = mptcp_sk(sk); if (!__mptcp_try_fallback(msk, fb_mib)) return false; if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { gfp_t saved_allocation = ssk->sk_allocation; /* we are in a atomic (BH) scope, override ssk default for data * fin allocation */ ssk->sk_allocation = GFP_ATOMIC; ssk->sk_shutdown |= SEND_SHUTDOWN; tcp_shutdown(ssk, SEND_SHUTDOWN); ssk->sk_allocation = saved_allocation; } return true; } static inline void mptcp_early_fallback(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, int fb_mib) { subflow->request_mptcp = 0; WARN_ON_ONCE(!__mptcp_try_fallback(msk, fb_mib)); } static inline bool mptcp_check_infinite_map(struct sk_buff *skb) { struct mptcp_ext *mpext; mpext = skb ? mptcp_get_ext(skb) : NULL; if (mpext && mpext->infinite_map) return true; return false; } static inline bool is_active_ssk(struct mptcp_subflow_context *subflow) { return (subflow->request_mptcp || subflow->request_join); } static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) && is_active_ssk(subflow) && !subflow->conn_finished; } #ifdef CONFIG_SYN_COOKIES void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb); void __init mptcp_join_cookie_init(void); #else static inline void subflow_init_req_cookie_join_save(const struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) {} static inline bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subflow_req, struct sk_buff *skb) { return false; } static inline void mptcp_join_cookie_init(void) {} #endif #endif /* __MPTCP_PROTOCOL_H */
12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmision.com> */ #include <linux/module.h> #include <linux/ipc.h> #include <linux/nsproxy.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/ipc_namespace.h> #include <linux/msg.h> #include <linux/slab.h> #include <linux/cred.h> #include "util.h" static int proc_ipc_dointvec_minmax_orphans(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = container_of(table->data, struct ipc_namespace, shm_rmid_forced); int err; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err < 0) return err; if (ns->shm_rmid_forced) shm_destroy_orphaned(ns); return err; } static int proc_ipc_auto_msgmni(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = &dummy; if (write) pr_info_once("writing to auto_msgmni has no effect"); return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } static int proc_ipc_sem_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = container_of(table->data, struct ipc_namespace, sem_ctls); int ret, semmni; semmni = ns->sem_ctls[3]; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret) ret = sem_check_semmni(ns); /* * Reset the semmni value if an error happens. */ if (ret) ns->sem_ctls[3] = semmni; return ret; } int ipc_mni = IPCMNI; int ipc_mni_shift = IPCMNI_SHIFT; int ipc_min_cycle = RADIX_TREE_MAP_SIZE; static const struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, .maxlen = sizeof(init_ipc_ns.shm_ctlmax), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmall", .data = &init_ipc_ns.shm_ctlall, .maxlen = sizeof(init_ipc_ns.shm_ctlall), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmmni", .data = &init_ipc_ns.shm_ctlmni, .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { .procname = "shm_rmid_forced", .data = &init_ipc_ns.shm_rmid_forced, .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "msgmni", .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { .procname = "auto_msgmni", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_ipc_auto_msgmni, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "msgmnb", .data = &init_ipc_ns.msg_ctlmnb, .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "sem", .data = &init_ipc_ns.sem_ctls, .maxlen = 4*sizeof(int), .mode = 0644, .proc_handler = proc_ipc_sem_dointvec, }, #ifdef CONFIG_CHECKPOINT_RESTORE { .procname = "sem_next_id", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "msg_next_id", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), .mode = 0444, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, #endif }; static struct ctl_table_set *set_lookup(struct ctl_table_root *root) { return &current->nsproxy->ipc_ns->ipc_set; } static int set_is_seen(struct ctl_table_set *set) { return &current->nsproxy->ipc_ns->ipc_set == set; } static void ipc_set_ownership(struct ctl_table_header *head, kuid_t *uid, kgid_t *gid) { struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, ipc_set); kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; } static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table) { int mode = table->mode; #ifdef CONFIG_CHECKPOINT_RESTORE struct ipc_namespace *ns = container_of(head->set, struct ipc_namespace, ipc_set); if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || (table->data == &ns->ids[IPC_MSG_IDS].next_id) || (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && checkpoint_restore_ns_capable(ns->user_ns)) mode = 0666; else #endif { kuid_t ns_root_uid; kgid_t ns_root_gid; ipc_set_ownership(head, &ns_root_uid, &ns_root_gid); if (uid_eq(current_euid(), ns_root_uid)) mode >>= 6; else if (in_egroup_p(ns_root_gid)) mode >>= 3; } mode &= 7; return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = ipc_permissions, .set_ownership = ipc_set_ownership, }; bool setup_ipc_sysctls(struct ipc_namespace *ns) { struct ctl_table *tbl; setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen); tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) { if (tbl[i].data == &init_ipc_ns.shm_ctlmax) tbl[i].data = &ns->shm_ctlmax; else if (tbl[i].data == &init_ipc_ns.shm_ctlall) tbl[i].data = &ns->shm_ctlall; else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) tbl[i].data = &ns->shm_ctlmni; else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) tbl[i].data = &ns->shm_rmid_forced; else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) tbl[i].data = &ns->msg_ctlmax; else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) tbl[i].data = &ns->msg_ctlmni; else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) tbl[i].data = &ns->msg_ctlmnb; else if (tbl[i].data == &init_ipc_ns.sem_ctls) tbl[i].data = &ns->sem_ctls; #ifdef CONFIG_CHECKPOINT_RESTORE else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id; else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id; else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id; #endif else tbl[i].data = NULL; } ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl, ARRAY_SIZE(ipc_sysctls)); } if (!ns->ipc_sysctls) { kfree(tbl); retire_sysctl_set(&ns->ipc_set); return false; } return true; } void retire_ipc_sysctls(struct ipc_namespace *ns) { const struct ctl_table *tbl; tbl = ns->ipc_sysctls->ctl_table_arg; unregister_sysctl_table(ns->ipc_sysctls); retire_sysctl_set(&ns->ipc_set); kfree(tbl); } static int __init ipc_sysctl_init(void) { if (!setup_ipc_sysctls(&init_ipc_ns)) { pr_warn("ipc sysctl registration failed\n"); return -ENOMEM; } return 0; } device_initcall(ipc_sysctl_init); static int __init ipc_mni_extend(char *str) { ipc_mni = IPCMNI_EXTEND; ipc_mni_shift = IPCMNI_EXTEND_SHIFT; ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE; pr_info("IPCMNI extended to %d.\n", ipc_mni); return 0; } early_param("ipcmni_extend", ipc_mni_extend);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGE_COUNTER_H #define _LINUX_PAGE_COUNTER_H #include <linux/atomic.h> #include <linux/cache.h> #include <linux/limits.h> #include <asm/page.h> struct page_counter { /* * Make sure 'usage' does not share cacheline with any other field in * v2. The memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; unsigned long failcnt; /* v1-only field */ CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; atomic_long_t min_usage; atomic_long_t children_min_usage; /* effective memory.low and memory.low usage tracking */ unsigned long elow; atomic_long_t low_usage; atomic_long_t children_low_usage; unsigned long watermark; /* Latest cg2 reset watermark */ unsigned long local_watermark; /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); bool protection_support; bool track_failcnt; unsigned long min; unsigned long low; unsigned long high; unsigned long max; struct page_counter *parent; } ____cacheline_internodealigned_in_smp; #if BITS_PER_LONG == 32 #define PAGE_COUNTER_MAX LONG_MAX #else #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) #endif /* * Protection is supported only for the first counter (with id 0). */ static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent, bool protection_support) { counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0); counter->max = PAGE_COUNTER_MAX; counter->parent = parent; counter->protection_support = protection_support; counter->track_failcnt = false; } static inline unsigned long page_counter_read(struct page_counter *counter) { return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); static inline void page_counter_set_high(struct page_counter *counter, unsigned long nr_pages) { WRITE_ONCE(counter->high, nr_pages); } int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); static inline void page_counter_reset_watermark(struct page_counter *counter) { unsigned long usage = page_counter_read(counter); /* * Update local_watermark first, so it's always <= watermark * (modulo CPU/compiler re-ordering) */ counter->local_watermark = usage; counter->watermark = usage; } #if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection); #else static inline void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection) {} #endif #endif /* _LINUX_PAGE_COUNTER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLOCKGROUP_LOCK_H #define _LINUX_BLOCKGROUP_LOCK_H /* * Per-blockgroup locking for ext2 and ext3. * * Simple hashed spinlocking. */ #include <linux/spinlock.h> #include <linux/cache.h> #ifdef CONFIG_SMP #define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else #define NR_BG_LOCKS 1 #endif struct bgl_lock { spinlock_t lock; } ____cacheline_aligned_in_smp; struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; }; static inline void bgl_lock_init(struct blockgroup_lock *bgl) { int i; for (i = 0; i < NR_BG_LOCKS; i++) spin_lock_init(&bgl->locks[i].lock); } static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif
87 86 88 85 5 53 53 53 53 53 53 52 52 52 1 1 1 1276 1 900 1 1275 2167 2279 432 1 50 52 53 52 51 52 53 53 52 53 52 53 53 50 53 51 52 53 52 5 5 5 5 5 5 5 19 5 5 5 5 19 6 6 4 6 2301 2309 2304 2306 2310 2304 2311 2294 2276 2273 2276 2283 2277 2305 1392 2304 435 432 431 433 435 194 2176 2181 2175 2171 2177 2173 2181 242 244 242 241 240 242 13 14 40 39 3 3 1 37 37 31 32 31 26 11 39 42 40 39 8 8 8 39 40 40 8 197 48 48 48 48 48 47 9987 10008 9895 9889 9881 9884 3 10121 10120 10117 314 301 13 25 25 25 2 2 2 2 2 2 9799 50 46 8660 9721 9631 9737 163 1144 269 1 3 1167 1151 1150 267 1155 262 2 2 2 5 5 5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 3 2 1 1 1 4 4 3 3 3 2 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } /* * Check if the allocation size would exceed INT_MAX. kvmalloc_array() * and kvmalloc() will warn if the allocation size is greater than * INT_MAX, as filp_cachep objects are not __GFP_NOWARN. * * This can happen when sysctl_nr_open is set to a very high value and * a process tries to use a file descriptor near that limit. For example, * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what * systemd typically sets it to - then trying to use a file descriptor * close to that value will require allocating a file descriptor table * that exceeds 8GB in size. */ if (unlikely(nr > INT_MAX / sizeof(struct file *))) return ERR_PTR(-EMFILE); fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* Can we expand? */ if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; /* * We may be racing against fd allocation from other threads using this * files_struct, despite holding ->file_lock. * * alloc_fd() might have already claimed a slot, while fd_install() * did not populate it yet. Note the latter operates locklessly, so * the file can show up as we are walking the array below. * * At the same time we know no files will disappear as all other * operations take the lock. * * Instead of trying to placate userspace racing with itself, we * ref the file if we see it and mark the fd slot as unused otherwise. */ for (i = open_files; i != 0; i--) { struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array while it is being resized. * * We need to make sure our update to the array does not get lost as the resizing * thread can be copying the content as we modify it. * * We have two ways to do it: * - go off CPU waiting for resize_in_progress to clear * - take the spin lock * * The latter is trivial to implement and saves us from having to might_sleep() * for debugging purposes. * * This is moved out of line from fd_install() to convince gcc to optimize that * routine better. */ static void noinline fd_install_slowpath(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); } /** * fd_install - install a file pointer in the fd array * @fd: file descriptor to install the file in * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); fd_install_slowpath(fd, file); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = &range; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS)) return false; if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) return true; if (file->f_op->iterate_shared) return true; return false; } bool file_seek_cur_needs_f_lock(struct file *file) { if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) return false; /* * Note that we are not guaranteed to be called after fdget_pos() on * this file obj, in which case the caller is expected to provide the * appropriate locking. */ return true; } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * dup2() is expected to close the file installed in the target fd slot * (if any). However, userspace hand-picking a fd may be racing against * its own threads which happened to allocate it in open() et al but did * not populate it yet. * * Broadly speaking we may be racing against the following: * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL * file = hard_work_goes_here(); * fd_install(fd, file); // only now ->fd[fd] == file * * It is an invariant that a successfully allocated fd has a NULL entry * in the array until the matching fd_install(). * * If we fit the window, we have the fd to populate, yet no target file * to close. Trying to ignore it and install our new file would violate * the invariant and make fd_install() overwrite our file. * * Things can be done(tm) to handle this. However, the issue does not * concern legitimate programs and we only need to make sure the kernel * does not trip over it. * * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; err = do_dup2(files, file, fd, flags); if (err < 0) return err; return 0; out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; FD_PREPARE(fdf, o_flags, file); if (fdf.err) return fdf.err; get_file(file); if (ufd) { error = put_user(fd_prepare_fd(fdf), ufd); if (error) return error; } __receive_sock(fd_prepare_file(fdf)); return fd_publish(fdf); } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for some microsoft "special" devices * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby */ /* */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" #define MS_HIDINPUT BIT(0) #define MS_ERGONOMY BIT(1) #define MS_PRESENTER BIT(2) #define MS_RDESC BIT(3) #define MS_NOGET BIT(4) #define MS_DUPLICATE_USAGES BIT(5) #define MS_SURFACE_DIAL BIT(6) #define MS_QUIRK_FF BIT(7) struct ms_data { unsigned long quirks; struct hid_device *hdev; struct work_struct ff_worker; __u8 strong; __u8 weak; void *output_report_dmabuf; }; #define XB1S_FF_REPORT 3 #define ENABLE_WEAK BIT(0) #define ENABLE_STRONG BIT(1) enum { MAGNITUDE_STRONG = 2, MAGNITUDE_WEAK, MAGNITUDE_NUM }; struct xb1s_ff_report { __u8 report_id; __u8 enable; __u8 magnitude[MAGNITUDE_NUM]; __u8 duration_10ms; __u8 start_delay_10ms; __u8 loop_count; } __packed; static const __u8 *ms_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { struct ms_data *ms = hid_get_drvdata(hdev); unsigned long quirks = ms->quirks; /* * Microsoft Wireless Desktop Receiver (Model 1028) has * 'Usage Min/Max' where it ought to have 'Physical Min/Max' */ if ((quirks & MS_RDESC) && *rsize == 571 && rdesc[557] == 0x19 && rdesc[559] == 0x29) { hid_info(hdev, "fixing up Microsoft Wireless Receiver Model 1028 report descriptor\n"); rdesc[557] = 0x35; rdesc[559] = 0x45; } return rdesc; } #define ms_map_key_clear(c) hid_map_usage_clear(hi, usage, bit, max, \ EV_KEY, (c)) static int ms_ergonomy_kb_quirk(struct hid_input *hi, struct hid_usage *usage, unsigned long **bit, int *max) { struct input_dev *input = hi->input; if ((usage->hid & HID_USAGE_PAGE) == HID_UP_CONSUMER) { switch (usage->hid & HID_USAGE) { /* * Microsoft uses these 2 reserved usage ids for 2 keys on * the MS office kb labelled "Office Home" and "Task Pane". */ case 0x29d: ms_map_key_clear(KEY_PROG1); return 1; case 0x29e: ms_map_key_clear(KEY_PROG2); return 1; } return 0; } if ((usage->hid & HID_USAGE_PAGE) != HID_UP_MSVENDOR) return 0; switch (usage->hid & HID_USAGE) { case 0xfd06: ms_map_key_clear(KEY_CHAT); break; case 0xfd07: ms_map_key_clear(KEY_PHONE); break; case 0xff00: /* Special keypad keys */ ms_map_key_clear(KEY_KPEQUAL); set_bit(KEY_KPLEFTPAREN, input->keybit); set_bit(KEY_KPRIGHTPAREN, input->keybit); break; case 0xff01: /* Scroll wheel */ hid_map_usage_clear(hi, usage, bit, max, EV_REL, REL_WHEEL); break; case 0xff02: /* * This byte contains a copy of the modifier keys byte of a * standard hid keyboard report, as send by interface 0 * (this usage is found on interface 1). * * This byte only gets send when another key in the same report * changes state, and as such is useless, ignore it. */ return -1; case 0xff05: set_bit(EV_REP, input->evbit); ms_map_key_clear(KEY_F13); set_bit(KEY_F14, input->keybit); set_bit(KEY_F15, input->keybit); set_bit(KEY_F16, input->keybit); set_bit(KEY_F17, input->keybit); set_bit(KEY_F18, input->keybit); break; default: return 0; } return 1; } static int ms_presenter_8k_quirk(struct hid_input *hi, struct hid_usage *usage, unsigned long **bit, int *max) { if ((usage->hid & HID_USAGE_PAGE) != HID_UP_MSVENDOR) return 0; set_bit(EV_REP, hi->input->evbit); switch (usage->hid & HID_USAGE) { case 0xfd08: ms_map_key_clear(KEY_FORWARD); break; case 0xfd09: ms_map_key_clear(KEY_BACK); break; case 0xfd0b: ms_map_key_clear(KEY_PLAYPAUSE); break; case 0xfd0e: ms_map_key_clear(KEY_CLOSE); break; case 0xfd0f: ms_map_key_clear(KEY_PLAY); break; default: return 0; } return 1; } static int ms_surface_dial_quirk(struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { switch (usage->hid & HID_USAGE_PAGE) { case 0xff070000: case HID_UP_DIGITIZER: /* ignore those axis */ return -1; case HID_UP_GENDESK: switch (usage->hid) { case HID_GD_X: case HID_GD_Y: case HID_GD_RFKILL_BTN: /* ignore those axis */ return -1; } } return 0; } static int ms_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct ms_data *ms = hid_get_drvdata(hdev); unsigned long quirks = ms->quirks; if (quirks & MS_ERGONOMY) { int ret = ms_ergonomy_kb_quirk(hi, usage, bit, max); if (ret) return ret; } if ((quirks & MS_PRESENTER) && ms_presenter_8k_quirk(hi, usage, bit, max)) return 1; if (quirks & MS_SURFACE_DIAL) { int ret = ms_surface_dial_quirk(hi, field, usage, bit, max); if (ret) return ret; } return 0; } static int ms_input_mapped(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct ms_data *ms = hid_get_drvdata(hdev); unsigned long quirks = ms->quirks; if (quirks & MS_DUPLICATE_USAGES) clear_bit(usage->code, *bit); return 0; } static int ms_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct ms_data *ms = hid_get_drvdata(hdev); unsigned long quirks = ms->quirks; struct input_dev *input; if (!(hdev->claimed & HID_CLAIMED_INPUT) || !field->hidinput || !usage->type) return 0; input = field->hidinput->input; /* Handling MS keyboards special buttons */ if (quirks & MS_ERGONOMY && usage->hid == (HID_UP_MSVENDOR | 0xff00)) { /* Special keypad keys */ input_report_key(input, KEY_KPEQUAL, value & 0x01); input_report_key(input, KEY_KPLEFTPAREN, value & 0x02); input_report_key(input, KEY_KPRIGHTPAREN, value & 0x04); return 1; } if (quirks & MS_ERGONOMY && usage->hid == (HID_UP_MSVENDOR | 0xff01)) { /* Scroll wheel */ int step = ((value & 0x60) >> 5) + 1; switch (value & 0x1f) { case 0x01: input_report_rel(input, REL_WHEEL, step); break; case 0x1f: input_report_rel(input, REL_WHEEL, -step); break; } return 1; } if (quirks & MS_ERGONOMY && usage->hid == (HID_UP_MSVENDOR | 0xff05)) { static unsigned int last_key = 0; unsigned int key = 0; switch (value) { case 0x01: key = KEY_F14; break; case 0x02: key = KEY_F15; break; case 0x04: key = KEY_F16; break; case 0x08: key = KEY_F17; break; case 0x10: key = KEY_F18; break; } if (key) { input_event(input, usage->type, key, 1); last_key = key; } else input_event(input, usage->type, last_key, 0); return 1; } return 0; } static void ms_ff_worker(struct work_struct *work) { struct ms_data *ms = container_of(work, struct ms_data, ff_worker); struct hid_device *hdev = ms->hdev; struct xb1s_ff_report *r = ms->output_report_dmabuf; int ret; memset(r, 0, sizeof(*r)); r->report_id = XB1S_FF_REPORT; r->enable = ENABLE_WEAK | ENABLE_STRONG; /* * Specifying maximum duration and maximum loop count should * cover maximum duration of a single effect, which is 65536 * ms */ r->duration_10ms = U8_MAX; r->loop_count = U8_MAX; r->magnitude[MAGNITUDE_STRONG] = ms->strong; /* left actuator */ r->magnitude[MAGNITUDE_WEAK] = ms->weak; /* right actuator */ ret = hid_hw_output_report(hdev, (__u8 *)r, sizeof(*r)); if (ret < 0) hid_warn(hdev, "failed to send FF report\n"); } static int ms_play_effect(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct ms_data *ms = hid_get_drvdata(hid); if (effect->type != FF_RUMBLE) return 0; /* * Magnitude is 0..100 so scale the 16-bit input here */ ms->strong = ((u32) effect->u.rumble.strong_magnitude * 100) / U16_MAX; ms->weak = ((u32) effect->u.rumble.weak_magnitude * 100) / U16_MAX; schedule_work(&ms->ff_worker); return 0; } static int ms_init_ff(struct hid_device *hdev) { struct hid_input *hidinput; struct input_dev *input_dev; struct ms_data *ms = hid_get_drvdata(hdev); if (list_empty(&hdev->inputs)) { hid_err(hdev, "no inputs found\n"); return -ENODEV; } hidinput = list_entry(hdev->inputs.next, struct hid_input, list); input_dev = hidinput->input; if (!(ms->quirks & MS_QUIRK_FF)) return 0; ms->hdev = hdev; INIT_WORK(&ms->ff_worker, ms_ff_worker); ms->output_report_dmabuf = devm_kzalloc(&hdev->dev, sizeof(struct xb1s_ff_report), GFP_KERNEL); if (ms->output_report_dmabuf == NULL) return -ENOMEM; input_set_capability(input_dev, EV_FF, FF_RUMBLE); return input_ff_create_memless(input_dev, NULL, ms_play_effect); } static void ms_remove_ff(struct hid_device *hdev) { struct ms_data *ms = hid_get_drvdata(hdev); if (!(ms->quirks & MS_QUIRK_FF)) return; cancel_work_sync(&ms->ff_worker); } static int ms_probe(struct hid_device *hdev, const struct hid_device_id *id) { unsigned long quirks = id->driver_data; struct ms_data *ms; int ret; ms = devm_kzalloc(&hdev->dev, sizeof(*ms), GFP_KERNEL); if (ms == NULL) return -ENOMEM; ms->quirks = quirks; hid_set_drvdata(hdev, ms); if (quirks & MS_NOGET) hdev->quirks |= HID_QUIRK_NOGET; if (quirks & MS_SURFACE_DIAL) hdev->quirks |= HID_QUIRK_INPUT_PER_APP; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT | ((quirks & MS_HIDINPUT) ? HID_CONNECT_HIDINPUT_FORCE : 0)); if (ret) { hid_err(hdev, "hw start failed\n"); goto err_free; } ret = ms_init_ff(hdev); if (ret) hid_err(hdev, "could not initialize ff, continuing anyway"); return 0; err_free: return ret; } static void ms_remove(struct hid_device *hdev) { hid_hw_stop(hdev); ms_remove_ff(hdev); } static const struct hid_device_id ms_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_SIDEWINDER_GV), .driver_data = MS_HIDINPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_OFFICE_KB), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_NE4K), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_NE4K_JP), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_NE7K), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_LK6K), .driver_data = MS_ERGONOMY | MS_RDESC }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_PRESENTER_8K_USB), .driver_data = MS_PRESENTER }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_DIGITAL_MEDIA_3K), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_DIGITAL_MEDIA_7K), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_DIGITAL_MEDIA_600), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_DIGITAL_MEDIA_3KV1), .driver_data = MS_ERGONOMY }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_WIRELESS_OPTICAL_DESKTOP_3_0), .driver_data = MS_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_COMFORT_MOUSE_4500), .driver_data = MS_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_POWER_COVER), .driver_data = MS_HIDINPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_COMFORT_KEYBOARD), .driver_data = MS_ERGONOMY}, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_PRESENTER_8K_BT), .driver_data = MS_PRESENTER }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, 0x091B), .driver_data = MS_SURFACE_DIAL }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_XBOX_CONTROLLER_MODEL_1708), .driver_data = MS_QUIRK_FF }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_XBOX_CONTROLLER_MODEL_1708_BLE), .driver_data = MS_QUIRK_FF }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_XBOX_CONTROLLER_MODEL_1914), .driver_data = MS_QUIRK_FF }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_XBOX_CONTROLLER_MODEL_1797), .driver_data = MS_QUIRK_FF }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_MS_XBOX_CONTROLLER_MODEL_1797_BLE), .driver_data = MS_QUIRK_FF }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_MICROSOFT, USB_DEVICE_ID_8BITDO_SN30_PRO_PLUS), .driver_data = MS_QUIRK_FF }, { } }; MODULE_DEVICE_TABLE(hid, ms_devices); static struct hid_driver ms_driver = { .name = "microsoft", .id_table = ms_devices, .report_fixup = ms_report_fixup, .input_mapping = ms_input_mapping, .input_mapped = ms_input_mapped, .event = ms_event, .probe = ms_probe, .remove = ms_remove, }; module_hid_driver(ms_driver); MODULE_DESCRIPTION("HID driver for some microsoft \"special\" devices"); MODULE_LICENSE("GPL");
81 4 8 553 480 88 547 91 545 100 100 100 100 99 268 46 234 64 64 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_NETDEV_LOCK_H #define _NET_NETDEV_LOCK_H #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> static inline bool netdev_trylock(struct net_device *dev) { return mutex_trylock(&dev->lock); } static inline void netdev_assert_locked(const struct net_device *dev) { lockdep_assert_held(&dev->lock); } static inline void netdev_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_assert_locked(dev); } static inline bool netdev_need_ops_lock(const struct net_device *dev) { bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; #if IS_ENABLED(CONFIG_NET_SHAPER) ret |= !!dev->netdev_ops->net_shaper_ops; #endif return ret; } static inline void netdev_lock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_lock(dev); } static inline void netdev_unlock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_unlock(dev); } static inline void netdev_lock_ops_to_full(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_assert_locked(dev); else netdev_lock(dev); } static inline void netdev_unlock_full_to_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_assert_locked(dev); else netdev_unlock(dev); } static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) lockdep_assert_held(&dev->lock); else ASSERT_RTNL(); } static inline void netdev_ops_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_ops_assert_locked(dev); } static inline void netdev_lock_ops_compat(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_lock(dev); else rtnl_lock(); } static inline void netdev_unlock_ops_compat(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_unlock(dev); else rtnl_unlock(); } static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { if (a == b) return 0; /* Allow locking multiple devices only under rtnl_lock, * the exact order doesn't matter. * Note that upper devices don't lock their ops, so nesting * mostly happens in batched device removal for now. */ return lockdep_rtnl_is_held() ? -1 : 1; } #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ static struct lock_class_key dev_instance_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ lockdep_set_class(&(dev)->lock, \ &dev_instance_lock_key); \ lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ &qdisc_xmit_lock_key); \ } #define netdev_lock_dereference(p, dev) \ rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock)) int netdev_debug_event(struct notifier_block *nb, unsigned long event, void *ptr); #endif
5 9 18 43 42 37 37 8 4 3 3 3 20 1 23 39 78 78 3 2 1 3 3 13 5 5 5 5 2 5 8 1 9 7 3 7 3 10 5 8 3 32 35 9 1 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_IP_TUNNELS_H #define __NET_IP_TUNNELS_H 1 #include <linux/if_tunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <linux/u64_stats_sync.h> #include <linux/bitops.h> #include <net/dsfield.h> #include <net/flow.h> #include <net/gro_cells.h> #include <net/inet_dscp.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) /* Used to memset ipv4 address padding. */ #define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) #define IP_TUNNEL_KEY_IPV4_PAD_LEN \ (sizeof_field(struct ip_tunnel_key, u) - \ sizeof_field(struct ip_tunnel_key, u.ipv4)) #define __ipt_flag_op(op, ...) \ op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM) #define IP_TUNNEL_DECLARE_FLAGS(...) \ __ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__) #define ip_tunnel_flags_zero(...) __ipt_flag_op(bitmap_zero, __VA_ARGS__) #define ip_tunnel_flags_copy(...) __ipt_flag_op(bitmap_copy, __VA_ARGS__) #define ip_tunnel_flags_and(...) __ipt_flag_op(bitmap_and, __VA_ARGS__) #define ip_tunnel_flags_or(...) __ipt_flag_op(bitmap_or, __VA_ARGS__) #define ip_tunnel_flags_empty(...) \ __ipt_flag_op(bitmap_empty, __VA_ARGS__) #define ip_tunnel_flags_intersect(...) \ __ipt_flag_op(bitmap_intersects, __VA_ARGS__) #define ip_tunnel_flags_subset(...) \ __ipt_flag_op(bitmap_subset, __VA_ARGS__) struct ip_tunnel_key { __be64 tun_id; union { struct { __be32 src; __be32 dst; } ipv4; struct { struct in6_addr src; struct in6_addr dst; } ipv6; } u; IP_TUNNEL_DECLARE_FLAGS(tun_flags); __be32 label; /* Flow Label for IPv6 */ u32 nhid; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; __u8 flow_flags; }; struct ip_tunnel_encap { u16 type; u16 flags; __be16 sport; __be16 dport; }; /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ #define IP_TUNNEL_INFO_BRIDGE 0x04 /* represents a bridged tunnel id */ /* Maximum tunnel options length. */ #define IP_TUNNEL_OPTS_MAX \ GENMASK((sizeof_field(struct ip_tunnel_info, \ options_len) * BITS_PER_BYTE) - 1, 0) #define ip_tunnel_info_opts(info) \ _Generic(info, \ const struct ip_tunnel_info * : ((const void *)(info)->options),\ struct ip_tunnel_info * : ((void *)(info)->options)\ ) struct ip_tunnel_info { struct ip_tunnel_key key; struct ip_tunnel_encap encap; #ifdef CONFIG_DST_CACHE struct dst_cache dst_cache; #endif u8 options_len; u8 mode; u8 options[] __aligned_largest __counted_by(options_len); }; /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { struct in6_addr prefix; __be32 relay_prefix; u16 prefixlen; u16 relay_prefixlen; }; #endif struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; u16 flags; struct rcu_head rcu_head; }; struct metadata_dst; /* Kernel-side variant of ip_tunnel_parm */ struct ip_tunnel_parm_kern { char name[IFNAMSIZ]; IP_TUNNEL_DECLARE_FLAGS(i_flags); IP_TUNNEL_DECLARE_FLAGS(o_flags); __be32 i_key; __be32 o_key; int link; struct iphdr iph; }; struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; struct net_device *dev; netdevice_tracker dev_tracker; struct net *net; /* netns for packet i/o */ unsigned long err_time; /* Time when the last ICMP error * arrived */ int err_count; /* Number of arrived ICMP errors */ /* These four fields used only by GRE */ u32 i_seqno; /* The last seen seqno */ atomic_t o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ u8 erspan_ver; /* ERSPAN version */ u8 dir; /* ERSPAN direction */ u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; struct ip_tunnel_parm_kern parms; int mlink; int encap_hlen; /* Encap header length (FOU,GUE) */ int hlen; /* tun_hlen + encap_hlen */ struct ip_tunnel_encap encap; /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; __u32 fwmark; bool collect_md; bool ignore_df; }; struct tnl_ptk_info { IP_TUNNEL_DECLARE_FLAGS(flags); __be16 proto; __be32 key; __be32 seq; int hdr_len; }; #define PACKET_RCVD 0 #define PACKET_REJECT 1 #define PACKET_NEXT 2 #define IP_TNL_HASH_BITS 7 #define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; int type; }; static inline void ip_tunnel_set_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); ip_tunnel_flags_or(flags, flags, present); } static inline void ip_tunnel_clear_options_present(unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); __ipt_flag_op(bitmap_andnot, flags, flags, present); } static inline bool ip_tunnel_is_options_present(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(present) = { }; __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present); __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present); __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present); __set_bit(IP_TUNNEL_GTP_OPT_BIT, present); __set_bit(IP_TUNNEL_PFCP_OPT_BIT, present); return ip_tunnel_flags_intersect(flags, present); } static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags) { IP_TUNNEL_DECLARE_FLAGS(supp) = { }; bitmap_set(supp, 0, BITS_PER_TYPE(__be16)); __set_bit(IP_TUNNEL_VTI_BIT, supp); return ip_tunnel_flags_subset(flags, supp); } static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags) { ip_tunnel_flags_zero(dst); bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16)); __assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI); } static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags) { __be16 ret; ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16))); if (test_bit(IP_TUNNEL_VTI_BIT, flags)) ret |= VTI_ISVTI; return ret; } static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, const unsigned long *tun_flags) { key->tun_id = tun_id; key->u.ipv4.src = saddr; key->u.ipv4.dst = daddr; memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; key->label = label; ip_tunnel_flags_copy(key->tun_flags, tun_flags); /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ key->tp_src = tp_src; key->tp_dst = tp_dst; /* Clear struct padding. */ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline bool ip_tunnel_dst_cache_usable(const struct sk_buff *skb, const struct ip_tunnel_info *info) { if (skb->mark) return false; return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; } static inline __be64 key32_to_tunnel_id(__be32 key) { #ifdef __BIG_ENDIAN return (__force __be64)key; #else return (__force __be64)((__force u64)key << 32); #endif } /* Returns the least-significant 32 bits of a __be64. */ static inline __be32 tunnel_id_to_key32(__be64 tun_id) { #ifdef __BIG_ENDIAN return (__force __be32)tun_id; #else return (__force __be32)((__force u64)tun_id >> 32); #endif } #ifdef CONFIG_INET static inline void ip_tunnel_init_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, __be32 key, __u8 tos, struct net *net, int oif, __u32 mark, __u32 tun_inner_hash, __u8 flow_flags) { memset(fl4, 0, sizeof(*fl4)); if (oif) { fl4->flowi4_l3mdev = l3mdev_master_upper_ifindex_by_index(net, oif); /* Legacy VRF/l3mdev use case */ fl4->flowi4_oif = fl4->flowi4_l3mdev ? 0 : oif; } fl4->daddr = daddr; fl4->saddr = saddr; fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; fl4->flowi4_multipath_hash = tun_inner_hash; fl4->flowi4_flags = flow_flags; } int ip_tunnel_init(struct net_device *dev); void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_net(struct net *net, unsigned int id, struct rtnl_link_ops *ops, struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const u8 proto, int tunnel_hlen); int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp, const void __user *data); bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, const unsigned long *flags, __be32 remote, __be32 local, __be32 key); void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); void ip_tunnel_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm_kern *parms); extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 extern const struct ip_tunnel_encap_ops __rcu * iptun_encaps[MAX_IPTUN_ENCAP_OPS]; int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); static inline enum skb_drop_reason pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; switch (skb->protocol) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; default: nhlen = 0; } return pskb_network_may_pull_reason(skb, nhlen); } static inline bool pskb_inet_may_pull(struct sk_buff *skb) { return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; } /* Variant of pskb_inet_may_pull(). */ static inline enum skb_drop_reason skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; enum skb_drop_reason reason; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. */ if (eth_type_vlan(type)) type = __vlan_get_protocol(skb, type, &maclen); switch (type) { #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): nhlen = sizeof(struct ipv6hdr); break; #endif case htons(ETH_P_IP): nhlen = sizeof(struct iphdr); break; } /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ reason = pskb_may_pull_reason(skb, maclen + nhlen); if (reason) return reason; skb_set_network_header(skb, maclen); return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) { const struct ip_tunnel_encap_ops *ops; int hlen = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->encap_hlen)) hlen = ops->encap_hlen(e); rcu_read_unlock(); return hlen; } static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { const struct ip_tunnel_encap_ops *ops; int ret = -EINVAL; if (e->type == TUNNEL_ENCAP_NONE) return 0; if (e->type >= MAX_IPTUN_ENCAP_OPS) return -EINVAL; rcu_read_lock(); ops = rcu_dereference(iptun_encaps[e->type]); if (likely(ops && ops->build_header)) ret = ops->build_header(skb, e, protocol, fl4); rcu_read_unlock(); return ret; } /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->tos; else if (payload_protocol == htons(ETH_P_IPV6)) return ipv6_get_dsfield((const struct ipv6hdr *)iph); else return 0; } static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IPV6)) return ip6_flowlabel((const struct ipv6hdr *)iph); else return 0; } static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { __be16 payload_protocol = skb_protocol(skb, true); if (payload_protocol == htons(ETH_P_IP)) return iph->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) return ((const struct ipv6hdr *)iph)->hop_limit; else return 0; } /* Propagate ECN bits out */ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, const struct sk_buff *skb) { u8 inner = ip_tunnel_get_dsfield(iph, skb); return INET_ECN_encapsulate(tos, inner); } int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool raw_proto, bool xnet); static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, bool xnet) { return __iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); } void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); static inline void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) { /* we must cap headroom to some upperlimit, else pskb_expand_head * will overflow header offsets in skb_headers_offset_update(). */ const unsigned int max_allowed = 512; if (headroom > max_allowed) headroom = max_allowed; if (headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, headroom); } int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) { if (skb_is_gso(skb)) { int err; err = skb_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> NETIF_F_GSO_SHIFT); } skb->encapsulation = 0; return 0; } static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, pkt_len); u64_stats_inc(&tstats->tx_packets); u64_stats_update_end(&tstats->syncp); put_cpu_ptr(tstats); return; } if (pkt_len < 0) { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } else { DEV_STATS_INC(dev, tx_dropped); } } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = len; if (len > 0) { memcpy(ip_tunnel_info_opts(info), from, len); ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags, flags); } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; } DECLARE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); /* Returns > 0 if metadata should be collected */ static inline int ip_tunnel_collect_metadata(void) { return static_branch_unlikely(&ip_tunnel_metadata_cnt); } void __init ip_tunnel_core_init(void); void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); #else /* CONFIG_INET */ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return NULL; } static inline void ip_tunnel_need_metadata(void) { } static inline void ip_tunnel_unneed_metadata(void) { } static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, const unsigned long *flags) { info->options_len = 0; } #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */
2 1 3 3 3 1 2 1 2 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 3 3 3 3 2 1 3 3 3 1 2 2 1 2 3 3 1 3 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 // SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. * * Authors: Pavel Emelyanov, <xemul@parallels.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/udp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, bool net_admin) { if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) goto out; err = -ENOMEM; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_put(sk); out_nosk: return err; } static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct udp_hslot *hslot = &table->hash[slot]; struct sock *sk; num = 0; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } next: num++; } spin_unlock_bh(&hslot->lock); } done: cb->args[0] = slot; cb->args[1] = num; } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int __udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req, struct udp_table *tbl) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); } #endif else { rcu_read_unlock(); return -EINVAL; } if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); if (!sk) return -ENOENT; if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_put(sk); return -ENOENT; } err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, &udplite_table); } #endif static const struct inet_diag_handler udp_diag_handler = { .owner = THIS_MODULE, .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udp_diag_destroy, #endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { .owner = THIS_MODULE, .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udplite_diag_destroy, #endif }; static int __init udp_diag_init(void) { int err; err = inet_diag_register(&udp_diag_handler); if (err) goto out; err = inet_diag_register(&udplite_diag_handler); if (err) goto out_lite; out: return err; out_lite: inet_diag_unregister(&udp_diag_handler); goto out; } static void __exit udp_diag_exit(void) { inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
1 1 2 2 1 1 2 2 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static inline bool devlink_rate_is_leaf(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } static inline bool devlink_rate_is_node(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; } static struct devlink_rate * devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) { struct devlink_rate *devlink_rate; struct devlink_port *devlink_port; devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); if (IS_ERR(devlink_port)) return ERR_CAST(devlink_port); devlink_rate = devlink_port->devlink_rate; return devlink_rate ?: ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && !strcmp(node_name, devlink_rate->name)) return devlink_rate; } return ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { const char *rate_node_name; size_t len; if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return ERR_PTR(-EINVAL); rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); len = strlen(rate_node_name); /* Name cannot be empty or decimal number */ if (!len || strspn(rate_node_name, "0123456789") == len) return ERR_PTR(-EINVAL); return devlink_rate_node_get_by_name(devlink, rate_node_name); } static struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } static struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; if (attrs[DEVLINK_ATTR_PORT_INDEX]) return devlink_rate_leaf_get_from_info(devlink, info); else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return devlink_rate_node_get_from_info(devlink, info); else return ERR_PTR(-EINVAL); } static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) { struct nlattr *nla_tc_bw; int i; for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); if (!nla_tc_bw) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) || nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i])) goto nla_put_failure; nla_nest_end(msg, nla_tc_bw); } return 0; nla_put_failure: nla_nest_cancel(msg, nla_tc_bw); return -EMSGSIZE; } static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_rate->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) goto nla_put_failure; if (devlink_rate_is_leaf(devlink_rate)) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_rate->devlink_port->index)) goto nla_put_failure; } else if (devlink_rate_is_node(devlink_rate)) { if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, devlink_rate->name)) goto nla_put_failure; } if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE, devlink_rate->tx_share)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX, devlink_rate->tx_max)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, devlink_rate->tx_weight)) goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) goto nla_put_failure; if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_rates_notify_register(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); } void devlink_rates_notify_unregister(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); } static int devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; int idx = 0; int err = 0; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; u32 id = NETLINK_CB(cb->skb).portid; if (idx < state->idx) { idx++; continue; } err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, cb->nlh->nlmsg_seq, flags, NULL); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); } int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; struct sk_buff *msg; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static bool devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, struct devlink_rate *parent) { while (parent) { if (parent == devlink_rate) return true; parent = parent->parent; } return false; } static int devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, struct genl_info *info, struct nlattr *nla_parent) { struct devlink *devlink = devlink_rate->devlink; const char *parent_name = nla_data(nla_parent); const struct devlink_ops *ops = devlink->ops; size_t len = strlen(parent_name); struct devlink_rate *parent; int err = -EOPNOTSUPP; parent = devlink_rate->parent; if (parent && !len) { if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); if (err) return err; refcount_dec(&parent->refcnt); devlink_rate->parent = NULL; } else if (len) { parent = devlink_rate_node_get_by_name(devlink, parent_name); if (IS_ERR(parent)) return -ENODEV; if (parent == devlink_rate) { NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); return -EINVAL; } if (devlink_rate_is_node(devlink_rate) && devlink_rate_is_parent_node(devlink_rate, parent->parent)) { NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); return -EEXIST; } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); if (err) return err; if (devlink_rate->parent) /* we're reassigning to other parent in this case */ refcount_dec(&devlink_rate->parent->refcnt); refcount_inc(&parent->refcnt); devlink_rate->parent = parent; } return 0; } static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, unsigned long *bitmap, struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1]; u8 tc_index; int err; err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest, devlink_dl_rate_tc_bws_nl_policy, extack); if (err) return err; if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_RATE_TC_ATTR_INDEX); return -EINVAL; } tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]); if (!tb[DEVLINK_RATE_TC_ATTR_BW]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, DEVLINK_RATE_TC_ATTR_BW); return -EINVAL; } if (test_and_set_bit(tc_index, bitmap)) { NL_SET_ERR_MSG_FMT(extack, "Duplicate traffic class index specified (%u)", tc_index); return -EINVAL; } tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]); return 0; } static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, struct genl_info *info) { DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; struct devlink *devlink = devlink_rate->devlink; const struct devlink_ops *ops = devlink->ops; u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; int rem, err = -EOPNOTSUPP, i; struct nlattr *attr; nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, GENL_HDRLEN, rem) { err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, info->extack); if (err) return err; } for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { if (!test_bit(i, bitmap)) { NL_SET_ERR_MSG_FMT(info->extack, "Bandwidth values must be specified for all %u traffic classes", DEVLINK_RATE_TCS_MAX); return -EINVAL; } } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, tc_bw, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, tc_bw, info->extack); if (err) return err; memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); return 0; } static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_share = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_max = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); if (err) return err; devlink_rate->tx_priority = priority; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); if (err) return err; devlink_rate->tx_weight = weight; } nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, nla_parent); if (err) return err; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { err = devlink_nl_rate_tc_bw_set(devlink_rate, info); if (err) return err; } return 0; } static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, struct genl_info *info, enum devlink_rate_type type) { struct nlattr **attrs = info->attrs; if (type == DEVLINK_RATE_TYPE_LEAF) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_leaf_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_leaf_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the leafs"); return false; } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_node_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && !ops->rate_node_tc_bw_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TC_BWS], "TC bandwidth set isn't supported for the nodes"); return false; } } else { WARN(1, "Unknown type of rate object"); return false; } return true; } int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; const struct devlink_ops *ops; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; err = devlink_nl_rate_set(devlink_rate, ops, info); if (!err) devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return err; } int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; const struct devlink_ops *ops; int err; ops = devlink->ops; if (!ops || !ops->rate_node_new || !ops->rate_node_del) { NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); return -EOPNOTSUPP; } if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) return -EOPNOTSUPP; rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); if (!IS_ERR(rate_node)) return -EEXIST; else if (rate_node == ERR_PTR(-EINVAL)) return -EINVAL; rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return -ENOMEM; rate_node->devlink = devlink; rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); if (!rate_node->name) { err = -ENOMEM; goto err_strdup; } err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); if (err) goto err_node_new; err = devlink_nl_rate_set(rate_node, ops, info); if (err) goto err_rate_set; refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return 0; err_rate_set: ops->rate_node_del(rate_node, rate_node->priv, info->extack); err_node_new: kfree(rate_node->name); err_strdup: kfree(rate_node); return err; } int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; int err; rate_node = devlink_rate_node_get_from_info(devlink, info); if (IS_ERR(rate_node)) return PTR_ERR(rate_node); if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); err = devlink->ops->rate_node_del(rate_node, rate_node->priv, info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); kfree(rate_node->name); kfree(rate_node); return err; } int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; } /** * devl_rate_node_create - create devlink rate node * @devlink: devlink instance * @priv: driver private data * @node_name: name of the resulting node * @parent: parent devlink_rate struct * * Create devlink rate object of type node */ struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent) { struct devlink_rate *rate_node; rate_node = devlink_rate_node_get_by_name(devlink, node_name); if (!IS_ERR(rate_node)) return ERR_PTR(-EEXIST); rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return ERR_PTR(-ENOMEM); if (parent) { rate_node->parent = parent; refcount_inc(&rate_node->parent->refcnt); } rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->devlink = devlink; rate_node->priv = priv; rate_node->name = kstrdup(node_name, GFP_KERNEL); if (!rate_node->name) { kfree(rate_node); return ERR_PTR(-ENOMEM); } refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return rate_node; } EXPORT_SYMBOL_GPL(devl_rate_node_create); /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * @parent: parent devlink_rate struct * * Create devlink rate object of type leaf on provided @devlink_port. */ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; devl_assert_locked(devlink_port->devlink); if (WARN_ON(devlink_port->devlink_rate)) return -EBUSY; devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); if (!devlink_rate) return -ENOMEM; if (parent) { devlink_rate->parent = parent; refcount_inc(&devlink_rate->parent->refcnt); } devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; devlink_rate->priv = priv; list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** * devl_rate_leaf_destroy - destroy devlink rate leaf * * @devlink_port: devlink port linked to the rate object * * Destroy the devlink rate object of type leaf on provided @devlink_port. */ void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { struct devlink_rate *devlink_rate = devlink_port->devlink_rate; devl_assert_locked(devlink_port->devlink); if (!devlink_rate) return; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); if (devlink_rate->parent) refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; kfree(devlink_rate); } EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. */ void devl_rate_nodes_destroy(struct devlink *devlink) { const struct devlink_ops *ops = devlink->ops; struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); refcount_dec(&devlink_rate->parent->refcnt); devlink_rate->parent = NULL; } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); list_del(&devlink_rate->list); kfree(devlink_rate->name); kfree(devlink_rate); } } } EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
7 7 7 7 7 6 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 // SPDX-License-Identifier: GPL-2.0 #include <linux/tcp.h> #include <net/tcp.h> static u32 tcp_rack_reo_wnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!tp->reord_seen) { /* If reordering has not been observed, be aggressive during * the recovery or starting the recovery by DUPACK threshold. */ if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery) return 0; if (tp->sacked_out >= tp->reordering && !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & TCP_RACK_NO_DUPTHRESH)) return 0; } /* To be more reordering resilient, allow min_rtt/4 settling delay. * Use min_rtt instead of the smoothed RTT because reordering is * often a path property and less related to queuing or delayed ACKs. * Upon receiving DSACKs, linearly increase the window up to the * smoothed RTT. */ return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps, tp->srtt_us >> 3); } s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) { return tp->rack.rtt_us + reo_wnd - tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb)); } /* RACK loss detection (IETF RFC8985): * * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * * dupthresh: 3 OOO packets delivered (packet count) * FACK: sequence delta to highest sacked sequence (sequence space) * RACK: sent time delta to the latest delivered packet (time domain) * * The advantage of RACK is it applies to both original and retransmitted * packet and therefore is robust against tail losses. Another advantage * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * * When tcp_rack_detect_loss() detects some packets are lost and we * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will * make us enter the CA_Recovery state. */ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; reo_wnd = tcp_rack_reo_wnd(sk); list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); s32 remaining; /* Skip ones marked lost but not yet retransmitted */ if ((scb->sacked & TCPCB_LOST) && !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; if (!tcp_skb_sent_after(tp->rack.mstamp, tcp_skb_timestamp_us(skb), tp->rack.end_seq, scb->end_seq)) break; /* A packet is lost if it has not been s/acked beyond * the recent RTT plus the reordering window. */ remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd); if (remaining <= 0) { tcp_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { /* Record maximum wait time */ *reo_timeout = max_t(u32, *reo_timeout, remaining); } } } bool tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; if (!tp->rack.advanced) return false; /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } return !!timeout; } /* Record the most recently (re)sent time among the (s)acked packets * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time) { u32 rtt_us; rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. * * If the original is lost, there is no ambiguity. Otherwise * we assume the original can be delayed up to aRTT + min_rtt. * the aRTT term is bounded by the fast recovery or timeout, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ return; } tp->rack.advanced = 1; tp->rack.rtt_us = rtt_us; if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, end_seq, tp->rack.end_seq)) { tp->rack.mstamp = xmit_time; tp->rack.end_seq = end_seq; } } /* We have waited long enough to accommodate reordering. Mark the expired * packets lost and retransmit them. */ void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout, prior_inflight; u32 lost = tp->lost; prior_inflight = tcp_packets_in_flight(tp); tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0); } tcp_xmit_retransmit_queue(sk); } if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) tcp_rearm_rto(sk); } /* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. * * If a DSACK is received that seems like it may have been due to reordering * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded * by srtt), since there is possibility that spurious retransmission was * due to reordering delay longer than reo_wnd. * * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) * no. of successful recoveries (accounts for full DSACK-based loss * recovery undo). After that, reset it to default (min_rtt/4). * * At max, reo_wnd is incremented only once per rtt. So that the new * DSACK on which we are reacting, is due to the spurious retx (approx) * after the reo_wnd has been updated last time. * * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than * absolute value to account for change in rtt. */ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & TCP_RACK_STATIC_REO_WND) || !rs->prior_delivered) return; /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ if (before(rs->prior_delivered, tp->rack.last_delivered)) tp->rack.dsack_seen = 0; /* Adjust the reo_wnd if update is pending */ if (tp->rack.dsack_seen) { tp->rack.reo_wnd_steps = min_t(u32, 0xFF, tp->rack.reo_wnd_steps + 1); tp->rack.dsack_seen = 0; tp->rack.last_delivered = tp->delivered; tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; } else if (!tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_steps = 1; } } /* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits * the next unacked packet upon receiving * a) three or more DUPACKs to start the fast recovery * b) an ACK acknowledging new data during the fast recovery. */ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced) { const u8 state = inet_csk(sk)->icsk_ca_state; struct tcp_sock *tp = tcp_sk(sk); if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) || (state == TCP_CA_Recovery && snd_una_advanced)) { struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 mss; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) return; mss = tcp_skb_mss(skb); if (tcp_skb_pcount(skb) > 1 && skb->len > mss) tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, mss, mss, GFP_ATOMIC); tcp_mark_skb_lost(sk, skb); } }
21 20 20 20 19 18 18 3 3 17 3 17 21 1 1 1 1 1 1 1 1 1 1 1 1 1 25 25 7 7 7 2 1 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/unaligned.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/tcp.h> struct nft_exthdr { u8 type; u8 offset; u8 len; u8 op; u8 dreg; u8 sreg; u8 flags; }; static unsigned int optlen(const u8 *opt, unsigned int offset) { /* Beware zero-length options: make finite progress */ if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0) return 1; else return opt[offset + 1]; } static int nft_skb_copy_to_reg(const struct sk_buff *skb, int offset, u32 *dest, unsigned int len) { if (len % NFT_REG32_SIZE) dest[len / NFT_REG32_SIZE] = 0; return skb_copy_bits(skb, offset, dest, len); } static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; unsigned int offset = 0; int err; if (pkt->skb->protocol != htons(ETH_P_IPV6)) goto err; err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; } offset += priv->offset; if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: regs->verdict.code = NFT_BREAK; } /* find the offset to specified option. * * If target header is found, its offset is set in *offset and return option * number. Otherwise, return negative error. * * If the first fragment doesn't contain the End of Options it is considered * invalid. */ static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned int *offset, int target) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; struct iphdr *iph, _iph; bool found = false; __be32 info; int optlen; iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (!iph) return -EBADMSG; optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); if (optlen <= 0) return -ENOENT; memset(opt, 0, sizeof(struct ip_options)); /* Copy the options since __ip_options_compile() modifies * the options. */ if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen)) return -EBADMSG; opt->optlen = optlen; if (__ip_options_compile(net, opt, NULL, &info)) return -EBADMSG; switch (target) { case IPOPT_SSRR: case IPOPT_LSRR: if (!opt->srr) break; found = target == IPOPT_SSRR ? opt->is_strictroute : !opt->is_strictroute; if (found) *offset = opt->srr; break; case IPOPT_RR: if (!opt->rr) break; *offset = opt->rr; found = true; break; case IPOPT_RA: if (!opt->router_alert) break; *offset = opt->router_alert; found = true; break; default: return -EOPNOTSUPP; } return found ? target : -ENOENT; } static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; struct sk_buff *skb = pkt->skb; unsigned int offset; int err; if (skb->protocol != htons(ETH_P_IP)) goto err; err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, err >= 0); return; } else if (err < 0) { goto err; } offset += priv->offset; if (nft_skb_copy_to_reg(pkt->skb, offset, dest, priv->len) < 0) goto err; return; err: regs->verdict.code = NFT_BREAK; } static void * nft_tcp_header_pointer(const struct nft_pktinfo *pkt, unsigned int len, void *buffer, unsigned int *tcphdr_len) { struct tcphdr *tcph; if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) return NULL; tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(*tcph), buffer); if (!tcph) return NULL; *tcphdr_len = __tcp_hdrlen(tcph); if (*tcphdr_len < sizeof(*tcph) || *tcphdr_len > len) return NULL; return skb_header_pointer(pkt->skb, nft_thoff(pkt), *tcphdr_len, buffer); } static void nft_exthdr_tcp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int i, optl, tcphdr_len, offset; u32 *dest = &regs->data[priv->dreg]; struct tcphdr *tcph; u8 *opt; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) goto err; opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { optl = optlen(opt, i); if (priv->type != opt[i]) continue; if (i + optl > tcphdr_len || priv->len + priv->offset > optl) goto err; offset = i + priv->offset; if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, 1); } else { if (priv->len % NFT_REG32_SIZE) dest[priv->len / NFT_REG32_SIZE] = 0; memcpy(dest, opt + offset, priv->len); } return; } err: if (priv->flags & NFT_EXTHDR_F_PRESENT) *dest = 0; else regs->verdict.code = NFT_BREAK; } static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int i, optl, tcphdr_len, offset; struct tcphdr *tcph; u8 *opt; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) goto err; if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) goto err; tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { union { __be16 v16; __be32 v32; } old, new; optl = optlen(opt, i); if (priv->type != opt[i]) continue; if (i + optl > tcphdr_len || priv->len + priv->offset > optl) goto err; offset = i + priv->offset; switch (priv->len) { case 2: old.v16 = (__force __be16)get_unaligned((u16 *)(opt + offset)); new.v16 = (__force __be16)nft_reg_load16( &regs->data[priv->sreg]); switch (priv->type) { case TCPOPT_MSS: /* increase can cause connection to stall */ if (ntohs(old.v16) <= ntohs(new.v16)) return; break; } if (old.v16 == new.v16) return; put_unaligned(new.v16, (__be16*)(opt + offset)); inet_proto_csum_replace2(&tcph->check, pkt->skb, old.v16, new.v16, false); break; case 4: new.v32 = nft_reg_load_be32(&regs->data[priv->sreg]); old.v32 = (__force __be32)get_unaligned((u32 *)(opt + offset)); if (old.v32 == new.v32) return; put_unaligned(new.v32, (__be32*)(opt + offset)); inet_proto_csum_replace4(&tcph->check, pkt->skb, old.v32, new.v32, false); break; default: WARN_ON_ONCE(1); break; } return; } return; err: regs->verdict.code = NFT_BREAK; } static void nft_exthdr_tcp_strip_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int i, tcphdr_len, optl; struct tcphdr *tcph; u8 *opt; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) goto err; if (skb_ensure_writable(pkt->skb, nft_thoff(pkt) + tcphdr_len)) goto drop; tcph = (struct tcphdr *)(pkt->skb->data + nft_thoff(pkt)); opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { unsigned int j; optl = optlen(opt, i); if (priv->type != opt[i]) continue; if (i + optl > tcphdr_len) goto drop; for (j = 0; j < optl; ++j) { u16 n = TCPOPT_NOP; u16 o = opt[i+j]; if ((i + j) % 2 == 0) { o <<= 8; n <<= 8; } inet_proto_csum_replace2(&tcph->check, pkt->skb, htons(o), htons(n), false); } memset(opt + i, TCPOPT_NOP, optl); return; } /* option not found, continue. This allows to do multiple * option removals per rule. */ return; err: regs->verdict.code = NFT_BREAK; return; drop: /* can't remove, no choice but to drop */ regs->verdict.code = NF_DROP; } static void nft_exthdr_sctp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { unsigned int offset = nft_thoff(pkt) + sizeof(struct sctphdr); struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; if (pkt->tprot != IPPROTO_SCTP) goto err; do { sch = skb_header_pointer(pkt->skb, offset, sizeof(_sch), &_sch); if (!sch || !sch->length) break; if (sch->type == priv->type) { if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, true); return; } if (priv->offset + priv->len > ntohs(sch->length) || offset + ntohs(sch->length) > pkt->skb->len) break; if (nft_skb_copy_to_reg(pkt->skb, offset + priv->offset, dest, priv->len) < 0) break; return; } offset += SCTP_PAD4(ntohs(sch->length)); } while (offset < pkt->skb->len); err: if (priv->flags & NFT_EXTHDR_F_PRESENT) nft_reg_store8(dest, false); else regs->verdict.code = NFT_BREAK; } #ifdef CONFIG_NFT_EXTHDR_DCCP static void nft_exthdr_dccp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); unsigned int thoff, dataoff, optoff, optlen, i; u32 *dest = &regs->data[priv->dreg]; const struct dccp_hdr *dh; struct dccp_hdr _dh; if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff) goto err; thoff = nft_thoff(pkt); dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh); if (!dh) goto err; dataoff = dh->dccph_doff * sizeof(u32); optoff = __dccp_hdr_len(dh); if (dataoff <= optoff) goto err; optlen = dataoff - optoff; for (i = 0; i < optlen; ) { /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in * the length; the remaining options are at least 2B long. In * all cases, the first byte contains the option type. In * multi-byte options, the second byte contains the option * length, which must be at least two: 1 for the type plus 1 for * the length plus 0-253 for any following option data. We * aren't interested in the option data, only the type and the * length, so we don't need to read more than two bytes at a * time. */ unsigned int buflen = optlen - i; u8 buf[2], *bufp; u8 type, len; if (buflen > sizeof(buf)) buflen = sizeof(buf); bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen, &buf); if (!bufp) goto err; type = bufp[0]; if (type == priv->type) { nft_reg_store8(dest, 1); return; } if (type <= DCCPO_MAX_RESERVED) { i++; continue; } if (buflen < 2) goto err; len = bufp[1]; if (len < 2) goto err; i += len; } err: *dest = 0; } #endif static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, [NFTA_EXTHDR_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_EXTHDR_SREG] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; int err; if (!tb[NFTA_EXTHDR_DREG] || !tb[NFTA_EXTHDR_TYPE] || !tb[NFTA_EXTHDR_OFFSET] || !tb[NFTA_EXTHDR_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); if (err < 0) return err; err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); if (err < 0) return err; if (tb[NFTA_EXTHDR_FLAGS]) { err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags); if (err < 0) return err; if (flags & ~NFT_EXTHDR_F_PRESENT) return -EINVAL; } if (tb[NFTA_EXTHDR_OP]) { err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); if (err < 0) return err; } priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->flags = flags; priv->op = op; return nft_parse_register_store(ctx, tb[NFTA_EXTHDR_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, priv->len); } static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; int err; if (!tb[NFTA_EXTHDR_SREG] || !tb[NFTA_EXTHDR_TYPE] || !tb[NFTA_EXTHDR_OFFSET] || !tb[NFTA_EXTHDR_LEN]) return -EINVAL; if (tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); if (err < 0) return err; err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); if (err < 0) return err; if (offset < 2) return -EOPNOTSUPP; switch (len) { case 2: break; case 4: break; default: return -EOPNOTSUPP; } err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); if (err < 0) return err; priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->flags = flags; priv->op = op; return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg, priv->len); } static int nft_exthdr_tcp_strip_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); if (tb[NFTA_EXTHDR_SREG] || tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS] || tb[NFTA_EXTHDR_OFFSET] || tb[NFTA_EXTHDR_LEN]) return -EINVAL; if (!tb[NFTA_EXTHDR_TYPE]) return -EINVAL; priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->op = NFT_EXTHDR_OP_TCPOPT; return 0; } static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); int err = nft_exthdr_init(ctx, expr, tb); if (err < 0) return err; switch (priv->type) { case IPOPT_SSRR: case IPOPT_LSRR: case IPOPT_RR: case IPOPT_RA: break; default: return -EOPNOTSUPP; } return 0; } #ifdef CONFIG_NFT_EXTHDR_DCCP static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); int err = nft_exthdr_init(ctx, expr, tb); if (err < 0) return err; if (!(priv->flags & NFT_EXTHDR_F_PRESENT)) return -EOPNOTSUPP; return 0; } #endif static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_OFFSET, htonl(priv->offset))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags))) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_EXTHDR_DREG, priv->dreg)) return -1; return nft_exthdr_dump_common(skb, priv); } static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_EXTHDR_SREG, priv->sreg)) return -1; return nft_exthdr_dump_common(skb, priv); } static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); return nft_exthdr_dump_common(skb, priv); } static bool nft_exthdr_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_exthdr *priv = nft_expr_priv(expr); const struct nft_exthdr *exthdr; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } exthdr = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->type != exthdr->type || priv->op != exthdr->op || priv->flags != exthdr->flags || priv->offset != exthdr->offset || priv->len != exthdr->len) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_ipv6_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_ipv4_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_ipv4_eval, .init = nft_exthdr_ipv4_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_tcp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_tcp_set_eval, .init = nft_exthdr_tcp_set_init, .dump = nft_exthdr_dump_set, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_exthdr_tcp_strip_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_tcp_strip_eval, .init = nft_exthdr_tcp_strip_init, .dump = nft_exthdr_dump_strip, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_exthdr_sctp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_sctp_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; #ifdef CONFIG_NFT_EXTHDR_DCCP static const struct nft_expr_ops nft_exthdr_dccp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), .eval = nft_exthdr_dccp_eval, .init = nft_exthdr_dccp_init, .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; #endif static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { u32 op; if (!tb[NFTA_EXTHDR_OP]) return &nft_exthdr_ipv6_ops; if (tb[NFTA_EXTHDR_SREG] && tb[NFTA_EXTHDR_DREG]) return ERR_PTR(-EOPNOTSUPP); op = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OP])); switch (op) { case NFT_EXTHDR_OP_TCPOPT: if (tb[NFTA_EXTHDR_SREG]) return &nft_exthdr_tcp_set_ops; if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_tcp_ops; return &nft_exthdr_tcp_strip_ops; case NFT_EXTHDR_OP_IPV6: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; break; case NFT_EXTHDR_OP_IPV4: if (ctx->family != NFPROTO_IPV6) { if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv4_ops; } break; case NFT_EXTHDR_OP_SCTP: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; #ifdef CONFIG_NFT_EXTHDR_DCCP case NFT_EXTHDR_OP_DCCP: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_dccp_ops; break; #endif } return ERR_PTR(-EOPNOTSUPP); } struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", .select_ops = nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, };
23 23 22 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 // SPDX-License-Identifier: GPL-2.0-only /* * NUMA support, based on the x86 implementation. * * Copyright (C) 2015 Cavium Inc. * Author: Ganapatrao Kulkarni <gkulkarni@cavium.com> */ #define pr_fmt(fmt) "NUMA: " fmt #include <linux/acpi.h> #include <linux/memblock.h> #include <linux/module.h> #include <linux/of.h> #include <linux/numa_memblks.h> #include <asm/sections.h> static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; bool numa_off; static __init int numa_parse_early_param(char *opt) { if (!opt) return -EINVAL; if (str_has_prefix(opt, "off")) numa_off = true; if (!strncmp(opt, "fake=", 5)) return numa_emu_cmdline(opt + 5); return 0; } early_param("numa", numa_parse_early_param); cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); #ifdef CONFIG_DEBUG_PER_CPU_MAPS /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ const struct cpumask *cpumask_of_node(int node) { if (node == NUMA_NO_NODE) return cpu_all_mask; if (WARN_ON(node < 0 || node >= nr_node_ids)) return cpu_none_mask; if (WARN_ON(node_to_cpumask_map[node] == NULL)) return cpu_online_mask; return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); #endif #ifndef CONFIG_NUMA_EMU static void numa_update_cpu(unsigned int cpu, bool remove) { int nid = cpu_to_node(cpu); if (nid == NUMA_NO_NODE) return; if (remove) cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]); else cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); } void numa_add_cpu(unsigned int cpu) { numa_update_cpu(cpu, false); } void numa_remove_cpu(unsigned int cpu) { numa_update_cpu(cpu, true); } #endif void numa_clear_node(unsigned int cpu) { numa_remove_cpu(cpu); set_cpu_numa_node(cpu, NUMA_NO_NODE); } /* * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) */ static void __init setup_node_to_cpumask_map(void) { int node; /* setup nr_node_ids if not done yet */ if (nr_node_ids == MAX_NUMNODES) setup_nr_node_ids(); /* allocate and clear the mapping */ for (node = 0; node < nr_node_ids; node++) { alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); cpumask_clear(node_to_cpumask_map[node]); } /* cpumask_of_node() will now work */ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } /* * Set the cpu to node and mem mapping */ void numa_store_cpu_info(unsigned int cpu) { set_cpu_numa_node(cpu, cpu_to_node_map[cpu]); } void __init early_map_cpu_to_node(unsigned int cpu, int nid) { /* fallback to node 0 */ if (nid < 0 || nid >= MAX_NUMNODES || numa_off) nid = 0; cpu_to_node_map[cpu] = nid; /* * We should set the numa node of cpu0 as soon as possible, because it * has already been set up online before. cpu_to_node(0) will soon be * called. */ if (!cpu) set_cpu_numa_node(cpu, nid); } #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); int early_cpu_to_node(int cpu) { return cpu_to_node_map[cpu]; } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { return node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; int rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_PAGE) { /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, early_cpu_to_node); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); #endif } #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node); #endif if (rc < 0) panic("Failed to initialize percpu areas (err=%d).", rc); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* * Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) { if (start_pfn >= end_pfn) pr_info("Initmem setup node %d [<memory-less node>]\n", nid); alloc_node_data(nid); NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start_pfn; NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; } static int __init numa_register_nodes(void) { int nid; /* Check the validity of the memblock/node mapping */ if (!memblock_validate_numa_coverage(0)) return -EINVAL; /* Finally register nodes. */ for_each_node_mask(nid, numa_nodes_parsed) { unsigned long start_pfn, end_pfn; get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); setup_node_data(nid, start_pfn, end_pfn); node_set_online(nid); } /* Setup online nodes to actual nodes*/ node_possible_map = numa_nodes_parsed; return 0; } static int __init numa_init(int (*init_func)(void)) { int ret; nodes_clear(numa_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); ret = numa_memblks_init(init_func, /* memblock_force_top_down */ false); if (ret < 0) goto out_free_distance; if (nodes_empty(numa_nodes_parsed)) { pr_info("No NUMA configuration found\n"); ret = -EINVAL; goto out_free_distance; } ret = numa_register_nodes(); if (ret < 0) goto out_free_distance; setup_node_to_cpumask_map(); return 0; out_free_distance: numa_reset_distance(); return ret; } /** * dummy_numa_init() - Fallback dummy NUMA init * * Used if there's no underlying NUMA architecture, NUMA initialization * fails, or NUMA is disabled on the command line. * * Must online at least one node (node 0) and add memory blocks that cover all * allowed memory. It is unlikely that this function fails. * * Return: 0 on success, -errno on failure. */ static int __init dummy_numa_init(void) { phys_addr_t start = memblock_start_of_DRAM(); phys_addr_t end = memblock_end_of_DRAM() - 1; int ret; if (numa_off) pr_info("NUMA disabled\n"); /* Forced off on command line. */ pr_info("Faking a node at [mem %pap-%pap]\n", &start, &end); ret = numa_add_memblk(0, start, end + 1); if (ret) { pr_err("NUMA init failed\n"); return ret; } node_set(0, numa_nodes_parsed); numa_off = true; return 0; } #ifdef CONFIG_ACPI_NUMA static int __init arch_acpi_numa_init(void) { int ret; ret = acpi_numa_init(); if (ret) { pr_debug("Failed to initialise from firmware\n"); return ret; } return srat_disabled() ? -EINVAL : 0; } #else static int __init arch_acpi_numa_init(void) { return -EOPNOTSUPP; } #endif /** * arch_numa_init() - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. The * last fallback is dummy single node config encompassing whole memory. */ void __init arch_numa_init(void) { if (!numa_off) { if (!acpi_disabled && !numa_init(arch_acpi_numa_init)) return; if (acpi_disabled && !numa_init(of_numa_init)) return; } numa_init(dummy_numa_init); } #ifdef CONFIG_NUMA_EMU void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, unsigned int nr_emu_nids) { int i, j; /* * Transform cpu_to_node_map table to use emulated nids by * reverse-mapping phys_nid. The maps should always exist but fall * back to zero just in case. */ for (i = 0; i < ARRAY_SIZE(cpu_to_node_map); i++) { if (cpu_to_node_map[i] == NUMA_NO_NODE) continue; for (j = 0; j < nr_emu_nids; j++) if (cpu_to_node_map[i] == emu_nid_to_phys[j]) break; cpu_to_node_map[i] = j < nr_emu_nids ? j : 0; } } u64 __init numa_emu_dma_end(void) { return memblock_start_of_DRAM() + SZ_4G; } void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable) { struct cpumask *mask; if (node == NUMA_NO_NODE) return; mask = node_to_cpumask_map[node]; if (!cpumask_available(mask)) { pr_err("node_to_cpumask_map[%i] NULL\n", node); dump_stack(); return; } if (enable) cpumask_set_cpu(cpu, mask); else cpumask_clear_cpu(cpu, mask); pr_debug("%s cpu %d node %d: mask now %*pbl\n", enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, cpumask_pr_args(mask)); } #endif /* CONFIG_NUMA_EMU */
12 11 1 1 12 1 12 10 10 11 9 11 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/ipc/msgutil.c * Copyright (C) 1999, 2004 Manfred Spraul */ #include <linux/spinlock.h> #include <linux/init.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/ipc.h> #include <linux/msg.h> #include <linux/ipc_namespace.h> #include <linux/utsname.h> #include <linux/proc_ns.h> #include <linux/uaccess.h> #include <linux/sched.h> #include <linux/nstree.h> #include "util.h" DEFINE_SPINLOCK(mq_lock); /* * The next 2 defines are here bc this is the only file * compiled when either CONFIG_SYSVIPC and CONFIG_POSIX_MQUEUE * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { .ns = NS_COMMON_INIT(init_ipc_ns), .user_ns = &init_user_ns, }; struct msg_msgseg { struct msg_msgseg *next; /* the next part of the message follows immediately */ }; #define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg)) static kmem_buckets *msg_buckets __ro_after_init; static int __init init_msg_buckets(void) { msg_buckets = kmem_buckets_create("msg_msg", SLAB_ACCOUNT, sizeof(struct msg_msg), DATALEN_MSG, NULL); return 0; } subsys_initcall(init_msg_buckets); static struct msg_msg *alloc_msg(size_t len) { struct msg_msg *msg; struct msg_msgseg **pseg; size_t alen; alen = min(len, DATALEN_MSG); msg = kmem_buckets_alloc(msg_buckets, sizeof(*msg) + alen, GFP_KERNEL); if (msg == NULL) return NULL; msg->next = NULL; msg->security = NULL; len -= alen; pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; cond_resched(); alen = min(len, DATALEN_SEG); seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT); if (seg == NULL) goto out_err; *pseg = seg; seg->next = NULL; pseg = &seg->next; len -= alen; } return msg; out_err: free_msg(msg); return NULL; } struct msg_msg *load_msg(const void __user *src, size_t len) { struct msg_msg *msg; struct msg_msgseg *seg; int err = -EFAULT; size_t alen; msg = alloc_msg(len); if (msg == NULL) return ERR_PTR(-ENOMEM); alen = min(len, DATALEN_MSG); if (copy_from_user(msg + 1, src, alen)) goto out_err; for (seg = msg->next; seg != NULL; seg = seg->next) { len -= alen; src = (char __user *)src + alen; alen = min(len, DATALEN_SEG); if (copy_from_user(seg + 1, src, alen)) goto out_err; } err = security_msg_msg_alloc(msg); if (err) goto out_err; return msg; out_err: free_msg(msg); return ERR_PTR(err); } #ifdef CONFIG_CHECKPOINT_RESTORE struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) { struct msg_msgseg *dst_pseg, *src_pseg; size_t len = src->m_ts; size_t alen; if (src->m_ts > dst->m_ts) return ERR_PTR(-EINVAL); alen = min(len, DATALEN_MSG); memcpy(dst + 1, src + 1, alen); for (dst_pseg = dst->next, src_pseg = src->next; src_pseg != NULL; dst_pseg = dst_pseg->next, src_pseg = src_pseg->next) { len -= alen; alen = min(len, DATALEN_SEG); memcpy(dst_pseg + 1, src_pseg + 1, alen); } dst->m_type = src->m_type; dst->m_ts = src->m_ts; return dst; } #else struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) { return ERR_PTR(-ENOSYS); } #endif int store_msg(void __user *dest, struct msg_msg *msg, size_t len) { size_t alen; struct msg_msgseg *seg; alen = min(len, DATALEN_MSG); if (copy_to_user(dest, msg + 1, alen)) return -1; for (seg = msg->next; seg != NULL; seg = seg->next) { len -= alen; dest = (char __user *)dest + alen; alen = min(len, DATALEN_SEG); if (copy_to_user(dest, seg + 1, alen)) return -1; } return 0; } void free_msg(struct msg_msg *msg) { struct msg_msgseg *seg; security_msg_msg_free(msg); seg = msg->next; kfree(msg); while (seg != NULL) { struct msg_msgseg *tmp = seg->next; cond_resched(); kfree(seg); seg = tmp; } }
9 2 3 1 3 1 3 9 9 9 9 9 9 9 9 1 1 1 1 1 8 2 8 8 8 3 3 3 3 3 3 3 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 /* * linux/drivers/video/modedb.c -- Standard video mode database management * * Copyright (C) 1999 Geert Uytterhoeven * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for * more details. */ #include <linux/export.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/fb.h> #include <linux/kernel.h> #undef DEBUG #define name_matches(v, s, l) \ ((v).name && !strncmp((s), (v).name, (l)) && strlen((v).name) == (l)) #define res_matches(v, x, y) \ ((v).xres == (x) && (v).yres == (y)) #ifdef DEBUG #define DPRINTK(fmt, args...) printk("modedb %s: " fmt, __func__ , ## args) #else #define DPRINTK(fmt, args...) #endif /* * Standard video mode definitions (taken from XFree86) */ static const struct fb_videomode modedb[] = { /* 640x400 @ 70 Hz, 31.5 kHz hsync */ { NULL, 70, 640, 400, 39721, 40, 24, 39, 9, 96, 2, 0, FB_VMODE_NONINTERLACED }, /* 640x480 @ 60 Hz, 31.5 kHz hsync */ { NULL, 60, 640, 480, 39721, 40, 24, 32, 11, 96, 2, 0, FB_VMODE_NONINTERLACED }, /* 800x600 @ 56 Hz, 35.15 kHz hsync */ { NULL, 56, 800, 600, 27777, 128, 24, 22, 1, 72, 2, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 87 Hz interlaced, 35.5 kHz hsync */ { NULL, 87, 1024, 768, 22271, 56, 24, 33, 8, 160, 8, 0, FB_VMODE_INTERLACED }, /* 640x400 @ 85 Hz, 37.86 kHz hsync */ { NULL, 85, 640, 400, 31746, 96, 32, 41, 1, 64, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 640x480 @ 72 Hz, 36.5 kHz hsync */ { NULL, 72, 640, 480, 31746, 144, 40, 30, 8, 40, 3, 0, FB_VMODE_NONINTERLACED }, /* 640x480 @ 75 Hz, 37.50 kHz hsync */ { NULL, 75, 640, 480, 31746, 120, 16, 16, 1, 64, 3, 0, FB_VMODE_NONINTERLACED }, /* 800x600 @ 60 Hz, 37.8 kHz hsync */ { NULL, 60, 800, 600, 25000, 88, 40, 23, 1, 128, 4, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 640x480 @ 85 Hz, 43.27 kHz hsync */ { NULL, 85, 640, 480, 27777, 80, 56, 25, 1, 56, 3, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 89 Hz interlaced, 44 kHz hsync */ { NULL, 89, 1152, 864, 15384, 96, 16, 110, 1, 216, 10, 0, FB_VMODE_INTERLACED }, /* 800x600 @ 72 Hz, 48.0 kHz hsync */ { NULL, 72, 800, 600, 20000, 64, 56, 23, 37, 120, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 60 Hz, 48.4 kHz hsync */ { NULL, 60, 1024, 768, 15384, 168, 8, 29, 3, 144, 6, 0, FB_VMODE_NONINTERLACED }, /* 640x480 @ 100 Hz, 53.01 kHz hsync */ { NULL, 100, 640, 480, 21834, 96, 32, 36, 8, 96, 6, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 60 Hz, 53.5 kHz hsync */ { NULL, 60, 1152, 864, 11123, 208, 64, 16, 4, 256, 8, 0, FB_VMODE_NONINTERLACED }, /* 800x600 @ 85 Hz, 55.84 kHz hsync */ { NULL, 85, 800, 600, 16460, 160, 64, 36, 16, 64, 5, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 70 Hz, 56.5 kHz hsync */ { NULL, 70, 1024, 768, 13333, 144, 24, 29, 3, 136, 6, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 87 Hz interlaced, 51 kHz hsync */ { NULL, 87, 1280, 1024, 12500, 56, 16, 128, 1, 216, 12, 0, FB_VMODE_INTERLACED }, /* 800x600 @ 100 Hz, 64.02 kHz hsync */ { NULL, 100, 800, 600, 14357, 160, 64, 30, 4, 64, 6, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 76 Hz, 62.5 kHz hsync */ { NULL, 76, 1024, 768, 11764, 208, 8, 36, 16, 120, 3, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 70 Hz, 62.4 kHz hsync */ { NULL, 70, 1152, 864, 10869, 106, 56, 20, 1, 160, 10, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 61 Hz, 64.2 kHz hsync */ { NULL, 61, 1280, 1024, 9090, 200, 48, 26, 1, 184, 3, 0, FB_VMODE_NONINTERLACED }, /* 1400x1050 @ 60Hz, 63.9 kHz hsync */ { NULL, 60, 1400, 1050, 9259, 136, 40, 13, 1, 112, 3, 0, FB_VMODE_NONINTERLACED }, /* 1400x1050 @ 75,107 Hz, 82,392 kHz +hsync +vsync*/ { NULL, 75, 1400, 1050, 7190, 120, 56, 23, 10, 112, 13, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1400x1050 @ 60 Hz, ? kHz +hsync +vsync*/ { NULL, 60, 1400, 1050, 9259, 128, 40, 12, 0, 112, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 85 Hz, 70.24 kHz hsync */ { NULL, 85, 1024, 768, 10111, 192, 32, 34, 14, 160, 6, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 78 Hz, 70.8 kHz hsync */ { NULL, 78, 1152, 864, 9090, 228, 88, 32, 0, 84, 12, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 70 Hz, 74.59 kHz hsync */ { NULL, 70, 1280, 1024, 7905, 224, 32, 28, 8, 160, 8, 0, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 60Hz, 75.00 kHz hsync */ { NULL, 60, 1600, 1200, 6172, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 84 Hz, 76.0 kHz hsync */ { NULL, 84, 1152, 864, 7407, 184, 312, 32, 0, 128, 12, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 74 Hz, 78.85 kHz hsync */ { NULL, 74, 1280, 1024, 7407, 256, 32, 34, 3, 144, 3, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 100Hz, 80.21 kHz hsync */ { NULL, 100, 1024, 768, 8658, 192, 32, 21, 3, 192, 10, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 76 Hz, 81.13 kHz hsync */ { NULL, 76, 1280, 1024, 7407, 248, 32, 34, 3, 104, 3, 0, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 70 Hz, 87.50 kHz hsync */ { NULL, 70, 1600, 1200, 5291, 304, 64, 46, 1, 192, 3, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 100 Hz, 89.62 kHz hsync */ { NULL, 100, 1152, 864, 7264, 224, 32, 17, 2, 128, 19, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 85 Hz, 91.15 kHz hsync */ { NULL, 85, 1280, 1024, 6349, 224, 64, 44, 1, 160, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 75 Hz, 93.75 kHz hsync */ { NULL, 75, 1600, 1200, 4938, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1680x1050 @ 60 Hz, 65.191 kHz hsync */ { NULL, 60, 1680, 1050, 6848, 280, 104, 30, 3, 176, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 85 Hz, 105.77 kHz hsync */ { NULL, 85, 1600, 1200, 4545, 272, 16, 37, 4, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 100 Hz, 107.16 kHz hsync */ { NULL, 100, 1280, 1024, 5502, 256, 32, 26, 7, 128, 15, 0, FB_VMODE_NONINTERLACED }, /* 1800x1440 @ 64Hz, 96.15 kHz hsync */ { NULL, 64, 1800, 1440, 4347, 304, 96, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1800x1440 @ 70Hz, 104.52 kHz hsync */ { NULL, 70, 1800, 1440, 4000, 304, 96, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 512x384 @ 78 Hz, 31.50 kHz hsync */ { NULL, 78, 512, 384, 49603, 48, 16, 16, 1, 64, 3, 0, FB_VMODE_NONINTERLACED }, /* 512x384 @ 85 Hz, 34.38 kHz hsync */ { NULL, 85, 512, 384, 45454, 48, 16, 16, 1, 64, 3, 0, FB_VMODE_NONINTERLACED }, /* 320x200 @ 70 Hz, 31.5 kHz hsync, 8:5 aspect ratio */ { NULL, 70, 320, 200, 79440, 16, 16, 20, 4, 48, 1, 0, FB_VMODE_DOUBLE }, /* 320x240 @ 60 Hz, 31.5 kHz hsync, 4:3 aspect ratio */ { NULL, 60, 320, 240, 79440, 16, 16, 16, 5, 48, 1, 0, FB_VMODE_DOUBLE }, /* 320x240 @ 72 Hz, 36.5 kHz hsync */ { NULL, 72, 320, 240, 63492, 16, 16, 16, 4, 48, 2, 0, FB_VMODE_DOUBLE }, /* 400x300 @ 56 Hz, 35.2 kHz hsync, 4:3 aspect ratio */ { NULL, 56, 400, 300, 55555, 64, 16, 10, 1, 32, 1, 0, FB_VMODE_DOUBLE }, /* 400x300 @ 60 Hz, 37.8 kHz hsync */ { NULL, 60, 400, 300, 50000, 48, 16, 11, 1, 64, 2, 0, FB_VMODE_DOUBLE }, /* 400x300 @ 72 Hz, 48.0 kHz hsync */ { NULL, 72, 400, 300, 40000, 32, 24, 11, 19, 64, 3, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 56 Hz, 35.2 kHz hsync, 8:5 aspect ratio */ { NULL, 56, 480, 300, 46176, 80, 16, 10, 1, 40, 1, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 60 Hz, 37.8 kHz hsync */ { NULL, 60, 480, 300, 41858, 56, 16, 11, 1, 80, 2, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 63 Hz, 39.6 kHz hsync */ { NULL, 63, 480, 300, 40000, 56, 16, 11, 1, 80, 2, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 72 Hz, 48.0 kHz hsync */ { NULL, 72, 480, 300, 33386, 40, 24, 11, 19, 80, 3, 0, FB_VMODE_DOUBLE }, /* 1920x1080 @ 60 Hz, 67.3 kHz hsync */ { NULL, 60, 1920, 1080, 6734, 148, 88, 36, 4, 44, 5, 0, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1920x1200 @ 60 Hz, 74.5 Khz hsync */ { NULL, 60, 1920, 1200, 5177, 128, 336, 1, 38, 208, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1152x768, 60 Hz, PowerBook G4 Titanium I and II */ { NULL, 60, 1152, 768, 14047, 158, 26, 29, 3, 136, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1366x768, 60 Hz, 47.403 kHz hsync, WXGA 16:9 aspect ratio */ { NULL, 60, 1366, 768, 13806, 120, 10, 14, 3, 32, 5, 0, FB_VMODE_NONINTERLACED }, /* 1280x800, 60 Hz, 47.403 kHz hsync, WXGA 16:10 aspect ratio */ { NULL, 60, 1280, 800, 12048, 200, 64, 24, 1, 136, 3, 0, FB_VMODE_NONINTERLACED }, /* 720x576i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ { NULL, 50, 720, 576, 74074, 64, 16, 39, 5, 64, 5, 0, FB_VMODE_INTERLACED }, /* 800x520i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ { NULL, 50, 800, 520, 58823, 144, 64, 72, 28, 80, 5, 0, FB_VMODE_INTERLACED }, /* 864x480 @ 60 Hz, 35.15 kHz hsync */ { NULL, 60, 864, 480, 27777, 1, 1, 1, 1, 0, 0, 0, FB_VMODE_NONINTERLACED }, }; #ifdef CONFIG_FB_MODE_HELPERS const struct fb_videomode vesa_modes[] = { /* 0 640x350-85 VESA */ { NULL, 85, 640, 350, 31746, 96, 32, 60, 32, 64, 3, FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA}, /* 1 640x400-85 VESA */ { NULL, 85, 640, 400, 31746, 96, 32, 41, 01, 64, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 2 720x400-85 VESA */ { NULL, 85, 721, 400, 28169, 108, 36, 42, 01, 72, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 3 640x480-60 VESA */ { NULL, 60, 640, 480, 39682, 48, 16, 33, 10, 96, 2, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 4 640x480-72 VESA */ { NULL, 72, 640, 480, 31746, 128, 24, 29, 9, 40, 2, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 5 640x480-75 VESA */ { NULL, 75, 640, 480, 31746, 120, 16, 16, 01, 64, 3, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 6 640x480-85 VESA */ { NULL, 85, 640, 480, 27777, 80, 56, 25, 01, 56, 3, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 7 800x600-56 VESA */ { NULL, 56, 800, 600, 27777, 128, 24, 22, 01, 72, 2, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 8 800x600-60 VESA */ { NULL, 60, 800, 600, 25000, 88, 40, 23, 01, 128, 4, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 9 800x600-72 VESA */ { NULL, 72, 800, 600, 20000, 64, 56, 23, 37, 120, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 10 800x600-75 VESA */ { NULL, 75, 800, 600, 20202, 160, 16, 21, 01, 80, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 11 800x600-85 VESA */ { NULL, 85, 800, 600, 17761, 152, 32, 27, 01, 64, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 12 1024x768i-43 VESA */ { NULL, 43, 1024, 768, 22271, 56, 8, 41, 0, 176, 8, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_INTERLACED, FB_MODE_IS_VESA }, /* 13 1024x768-60 VESA */ { NULL, 60, 1024, 768, 15384, 160, 24, 29, 3, 136, 6, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 14 1024x768-70 VESA */ { NULL, 70, 1024, 768, 13333, 144, 24, 29, 3, 136, 6, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 15 1024x768-75 VESA */ { NULL, 75, 1024, 768, 12690, 176, 16, 28, 1, 96, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 16 1024x768-85 VESA */ { NULL, 85, 1024, 768, 10582, 208, 48, 36, 1, 96, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 17 1152x864-75 VESA */ { NULL, 75, 1152, 864, 9259, 256, 64, 32, 1, 128, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 18 1280x960-60 VESA */ { NULL, 60, 1280, 960, 9259, 312, 96, 36, 1, 112, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 19 1280x960-85 VESA */ { NULL, 85, 1280, 960, 6734, 224, 64, 47, 1, 160, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 20 1280x1024-60 VESA */ { NULL, 60, 1280, 1024, 9259, 248, 48, 38, 1, 112, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 21 1280x1024-75 VESA */ { NULL, 75, 1280, 1024, 7407, 248, 16, 38, 1, 144, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 22 1280x1024-85 VESA */ { NULL, 85, 1280, 1024, 6349, 224, 64, 44, 1, 160, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 23 1600x1200-60 VESA */ { NULL, 60, 1600, 1200, 6172, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 24 1600x1200-65 VESA */ { NULL, 65, 1600, 1200, 5698, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 25 1600x1200-70 VESA */ { NULL, 70, 1600, 1200, 5291, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 26 1600x1200-75 VESA */ { NULL, 75, 1600, 1200, 4938, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 27 1600x1200-85 VESA */ { NULL, 85, 1600, 1200, 4357, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 28 1792x1344-60 VESA */ { NULL, 60, 1792, 1344, 4882, 328, 128, 46, 1, 200, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 29 1792x1344-75 VESA */ { NULL, 75, 1792, 1344, 3831, 352, 96, 69, 1, 216, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 30 1856x1392-60 VESA */ { NULL, 60, 1856, 1392, 4580, 352, 96, 43, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 31 1856x1392-75 VESA */ { NULL, 75, 1856, 1392, 3472, 352, 128, 104, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 32 1920x1440-60 VESA */ { NULL, 60, 1920, 1440, 4273, 344, 128, 56, 1, 200, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 33 1920x1440-75 VESA */ { NULL, 75, 1920, 1440, 3367, 352, 144, 56, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 34 1920x1200-60 RB VESA */ { NULL, 60, 1920, 1200, 6493, 80, 48, 26, 3, 32, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 35 1920x1200-60 VESA */ { NULL, 60, 1920, 1200, 5174, 336, 136, 36, 3, 200, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 36 1920x1200-75 VESA */ { NULL, 75, 1920, 1200, 4077, 344, 136, 46, 3, 208, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 37 1920x1200-85 VESA */ { NULL, 85, 1920, 1200, 3555, 352, 144, 53, 3, 208, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 38 2560x1600-60 RB VESA */ { NULL, 60, 2560, 1600, 3724, 80, 48, 37, 3, 32, 6, FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 39 2560x1600-60 VESA */ { NULL, 60, 2560, 1600, 2869, 472, 192, 49, 3, 280, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 40 2560x1600-75 VESA */ { NULL, 75, 2560, 1600, 2256, 488, 208, 63, 3, 280, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 41 2560x1600-85 VESA */ { NULL, 85, 2560, 1600, 1979, 488, 208, 73, 3, 280, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 42 2560x1600-120 RB VESA */ { NULL, 120, 2560, 1600, 1809, 80, 48, 85, 3, 32, 6, FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, }; EXPORT_SYMBOL(vesa_modes); const struct dmt_videomode dmt_modes[DMT_SIZE] = { { 0x01, 0x0000, 0x000000, &vesa_modes[0] }, { 0x02, 0x3119, 0x000000, &vesa_modes[1] }, { 0x03, 0x0000, 0x000000, &vesa_modes[2] }, { 0x04, 0x3140, 0x000000, &vesa_modes[3] }, { 0x05, 0x314c, 0x000000, &vesa_modes[4] }, { 0x06, 0x314f, 0x000000, &vesa_modes[5] }, { 0x07, 0x3159, 0x000000, &vesa_modes[6] }, { 0x08, 0x0000, 0x000000, &vesa_modes[7] }, { 0x09, 0x4540, 0x000000, &vesa_modes[8] }, { 0x0a, 0x454c, 0x000000, &vesa_modes[9] }, { 0x0b, 0x454f, 0x000000, &vesa_modes[10] }, { 0x0c, 0x4559, 0x000000, &vesa_modes[11] }, { 0x0d, 0x0000, 0x000000, NULL }, { 0x0e, 0x0000, 0x000000, NULL }, { 0x0f, 0x0000, 0x000000, &vesa_modes[12] }, { 0x10, 0x6140, 0x000000, &vesa_modes[13] }, { 0x11, 0x614a, 0x000000, &vesa_modes[14] }, { 0x12, 0x614f, 0x000000, &vesa_modes[15] }, { 0x13, 0x6159, 0x000000, &vesa_modes[16] }, { 0x14, 0x0000, 0x000000, NULL }, { 0x15, 0x714f, 0x000000, &vesa_modes[17] }, { 0x16, 0x0000, 0x7f1c21, NULL }, { 0x17, 0x0000, 0x7f1c28, NULL }, { 0x18, 0x0000, 0x7f1c44, NULL }, { 0x19, 0x0000, 0x7f1c62, NULL }, { 0x1a, 0x0000, 0x000000, NULL }, { 0x1b, 0x0000, 0x8f1821, NULL }, { 0x1c, 0x8100, 0x8f1828, NULL }, { 0x1d, 0x810f, 0x8f1844, NULL }, { 0x1e, 0x8119, 0x8f1862, NULL }, { 0x1f, 0x0000, 0x000000, NULL }, { 0x20, 0x8140, 0x000000, &vesa_modes[18] }, { 0x21, 0x8159, 0x000000, &vesa_modes[19] }, { 0x22, 0x0000, 0x000000, NULL }, { 0x23, 0x8180, 0x000000, &vesa_modes[20] }, { 0x24, 0x818f, 0x000000, &vesa_modes[21] }, { 0x25, 0x8199, 0x000000, &vesa_modes[22] }, { 0x26, 0x0000, 0x000000, NULL }, { 0x27, 0x0000, 0x000000, NULL }, { 0x28, 0x0000, 0x000000, NULL }, { 0x29, 0x0000, 0x0c2021, NULL }, { 0x2a, 0x9040, 0x0c2028, NULL }, { 0x2b, 0x904f, 0x0c2044, NULL }, { 0x2c, 0x9059, 0x0c2062, NULL }, { 0x2d, 0x0000, 0x000000, NULL }, { 0x2e, 0x9500, 0xc11821, NULL }, { 0x2f, 0x9500, 0xc11828, NULL }, { 0x30, 0x950f, 0xc11844, NULL }, { 0x31, 0x9519, 0xc11868, NULL }, { 0x32, 0x0000, 0x000000, NULL }, { 0x33, 0xa940, 0x000000, &vesa_modes[23] }, { 0x34, 0xa945, 0x000000, &vesa_modes[24] }, { 0x35, 0xa94a, 0x000000, &vesa_modes[25] }, { 0x36, 0xa94f, 0x000000, &vesa_modes[26] }, { 0x37, 0xa959, 0x000000, &vesa_modes[27] }, { 0x38, 0x0000, 0x000000, NULL }, { 0x39, 0x0000, 0x0c2821, NULL }, { 0x3a, 0xb300, 0x0c2828, NULL }, { 0x3b, 0xb30f, 0x0c2844, NULL }, { 0x3c, 0xb319, 0x0c2868, NULL }, { 0x3d, 0x0000, 0x000000, NULL }, { 0x3e, 0xc140, 0x000000, &vesa_modes[28] }, { 0x3f, 0xc14f, 0x000000, &vesa_modes[29] }, { 0x40, 0x0000, 0x000000, NULL}, { 0x41, 0xc940, 0x000000, &vesa_modes[30] }, { 0x42, 0xc94f, 0x000000, &vesa_modes[31] }, { 0x43, 0x0000, 0x000000, NULL }, { 0x44, 0x0000, 0x572821, &vesa_modes[34] }, { 0x45, 0xd100, 0x572828, &vesa_modes[35] }, { 0x46, 0xd10f, 0x572844, &vesa_modes[36] }, { 0x47, 0xd119, 0x572862, &vesa_modes[37] }, { 0x48, 0x0000, 0x000000, NULL }, { 0x49, 0xd140, 0x000000, &vesa_modes[32] }, { 0x4a, 0xd14f, 0x000000, &vesa_modes[33] }, { 0x4b, 0x0000, 0x000000, NULL }, { 0x4c, 0x0000, 0x1f3821, &vesa_modes[38] }, { 0x4d, 0x0000, 0x1f3828, &vesa_modes[39] }, { 0x4e, 0x0000, 0x1f3844, &vesa_modes[40] }, { 0x4f, 0x0000, 0x1f3862, &vesa_modes[41] }, { 0x50, 0x0000, 0x000000, &vesa_modes[42] }, }; EXPORT_SYMBOL(dmt_modes); #endif /* CONFIG_FB_MODE_HELPERS */ /** * fb_try_mode - test a video mode * @var: frame buffer user defined part of display * @info: frame buffer info structure * @mode: frame buffer video mode structure * @bpp: color depth in bits per pixel * * Tries a video mode to test it's validity for device @info. * * Returns 1 on success. * */ static int fb_try_mode(struct fb_var_screeninfo *var, struct fb_info *info, const struct fb_videomode *mode, unsigned int bpp) { int err = 0; DPRINTK("Trying mode %s %dx%d-%d@%d\n", mode->name ? mode->name : "noname", mode->xres, mode->yres, bpp, mode->refresh); var->xres = mode->xres; var->yres = mode->yres; var->xres_virtual = mode->xres; var->yres_virtual = mode->yres; var->xoffset = 0; var->yoffset = 0; var->bits_per_pixel = bpp; var->activate |= FB_ACTIVATE_TEST; var->pixclock = mode->pixclock; var->left_margin = mode->left_margin; var->right_margin = mode->right_margin; var->upper_margin = mode->upper_margin; var->lower_margin = mode->lower_margin; var->hsync_len = mode->hsync_len; var->vsync_len = mode->vsync_len; var->sync = mode->sync; var->vmode = mode->vmode; if (info->fbops->fb_check_var) err = info->fbops->fb_check_var(var, info); var->activate &= ~FB_ACTIVATE_TEST; return err; } /** * fb_find_mode - finds a valid video mode * @var: frame buffer user defined part of display * @info: frame buffer info structure * @mode_option: string video mode to find * @db: video mode database * @dbsize: size of @db * @default_mode: default video mode to fall back to * @default_bpp: default color depth in bits per pixel * * Finds a suitable video mode, starting with the specified mode * in @mode_option with fallback to @default_mode. If * @default_mode fails, all modes in the video mode database will * be tried. * * Valid mode specifiers for @mode_option:: * * <xres>x<yres>[M][R][-<bpp>][@<refresh>][i][p][m] * * or :: * * <name>[-<bpp>][@<refresh>] * * with <xres>, <yres>, <bpp> and <refresh> decimal numbers and * <name> a string. * * If 'M' is present after yres (and before refresh/bpp if present), * the function will compute the timings using VESA(tm) Coordinated * Video Timings (CVT). If 'R' is present after 'M', will compute with * reduced blanking (for flatpanels). If 'i' or 'p' are present, compute * interlaced or progressive mode. If 'm' is present, add margins equal * to 1.8% of xres rounded down to 8 pixels, and 1.8% of yres. The char * 'i', 'p' and 'm' must be after 'M' and 'R'. Example:: * * 1024x768MR-8@60m - Reduced blank with margins at 60Hz. * * NOTE: The passed struct @var is _not_ cleared! This allows you * to supply values for e.g. the grayscale and accel_flags fields. * * Returns zero for failure, 1 if using specified @mode_option, * 2 if using specified @mode_option with an ignored refresh rate, * 3 if default mode is used, 4 if fall back to any valid mode. */ int fb_find_mode(struct fb_var_screeninfo *var, struct fb_info *info, const char *mode_option, const struct fb_videomode *db, unsigned int dbsize, const struct fb_videomode *default_mode, unsigned int default_bpp) { char *mode_option_buf = NULL; int i; /* Set up defaults */ if (!db) { db = modedb; dbsize = ARRAY_SIZE(modedb); } if (!default_mode) default_mode = &db[0]; if (!default_bpp) default_bpp = 8; /* Did the user specify a video mode? */ if (!mode_option) { fb_get_options(NULL, &mode_option_buf); mode_option = mode_option_buf; } if (mode_option) { const char *name = mode_option; unsigned int namelen = strlen(name); int res_specified = 0, bpp_specified = 0, refresh_specified = 0; unsigned int xres = 0, yres = 0, bpp = default_bpp, refresh = 0; int yres_specified = 0, cvt = 0, rb = 0; int interlace_specified = 0, interlace = 0; int margins = 0; u32 best, diff, tdiff; for (i = namelen-1; i >= 0; i--) { switch (name[i]) { case '@': namelen = i; if (!refresh_specified && !bpp_specified && !yres_specified) { refresh = simple_strtol(&name[i+1], NULL, 10); refresh_specified = 1; if (cvt || rb) cvt = 0; } else goto done; break; case '-': namelen = i; if (!bpp_specified && !yres_specified) { bpp = simple_strtol(&name[i+1], NULL, 10); bpp_specified = 1; if (cvt || rb) cvt = 0; } else goto done; break; case 'x': if (!yres_specified) { yres = simple_strtol(&name[i+1], NULL, 10); yres_specified = 1; } else goto done; break; case '0' ... '9': break; case 'M': if (!yres_specified) cvt = 1; break; case 'R': if (!cvt) rb = 1; break; case 'm': if (!cvt) margins = 1; break; case 'p': if (!cvt) { interlace = 0; interlace_specified = 1; } break; case 'i': if (!cvt) { interlace = 1; interlace_specified = 1; } break; default: goto done; } } if (i < 0 && yres_specified) { xres = simple_strtol(name, NULL, 10); res_specified = 1; } done: kfree(mode_option_buf); if (cvt) { struct fb_videomode cvt_mode; int ret; DPRINTK("CVT mode %dx%d@%dHz%s%s%s\n", xres, yres, (refresh) ? refresh : 60, (rb) ? " reduced blanking" : "", (margins) ? " with margins" : "", (interlace) ? " interlaced" : ""); memset(&cvt_mode, 0, sizeof(cvt_mode)); cvt_mode.xres = xres; cvt_mode.yres = yres; cvt_mode.refresh = (refresh) ? refresh : 60; if (interlace) cvt_mode.vmode |= FB_VMODE_INTERLACED; else cvt_mode.vmode &= ~FB_VMODE_INTERLACED; ret = fb_find_mode_cvt(&cvt_mode, margins, rb); if (!ret && !fb_try_mode(var, info, &cvt_mode, bpp)) { DPRINTK("modedb CVT: CVT mode ok\n"); return 1; } DPRINTK("CVT mode invalid, getting mode from database\n"); } DPRINTK("Trying specified video mode%s %ix%i\n", refresh_specified ? "" : " (ignoring refresh rate)", xres, yres); if (!refresh_specified) { /* * If the caller has provided a custom mode database and * a valid monspecs structure, we look for the mode with * the highest refresh rate. Otherwise we play it safe * it and try to find a mode with a refresh rate closest * to the standard 60 Hz. */ if (db != modedb && info->monspecs.vfmin && info->monspecs.vfmax && info->monspecs.hfmin && info->monspecs.hfmax && info->monspecs.dclkmax) { refresh = 1000; } else { refresh = 60; } } diff = -1; best = -1; for (i = 0; i < dbsize; i++) { if ((name_matches(db[i], name, namelen) || (res_specified && res_matches(db[i], xres, yres))) && !fb_try_mode(var, info, &db[i], bpp)) { const int db_interlace = (db[i].vmode & FB_VMODE_INTERLACED ? 1 : 0); int score = abs(db[i].refresh - refresh); if (interlace_specified) score += abs(db_interlace - interlace); if (!interlace_specified || db_interlace == interlace) if (refresh_specified && db[i].refresh == refresh) return 1; if (score < diff) { diff = score; best = i; } } } if (best != -1) { fb_try_mode(var, info, &db[best], bpp); return (refresh_specified) ? 2 : 1; } diff = 2 * (xres + yres); best = -1; DPRINTK("Trying best-fit modes\n"); for (i = 0; i < dbsize; i++) { DPRINTK("Trying %ix%i\n", db[i].xres, db[i].yres); if (!fb_try_mode(var, info, &db[i], bpp)) { tdiff = abs(db[i].xres - xres) + abs(db[i].yres - yres); /* * Penalize modes with resolutions smaller * than requested. */ if (xres > db[i].xres || yres > db[i].yres) tdiff += xres + yres; if (diff > tdiff) { diff = tdiff; best = i; } } } if (best != -1) { fb_try_mode(var, info, &db[best], bpp); return 5; } } DPRINTK("Trying default video mode\n"); if (!fb_try_mode(var, info, default_mode, default_bpp)) return 3; DPRINTK("Trying all modes\n"); for (i = 0; i < dbsize; i++) if (!fb_try_mode(var, info, &db[i], default_bpp)) return 4; DPRINTK("No valid mode found\n"); return 0; } /** * fb_var_to_videomode - convert fb_var_screeninfo to fb_videomode * @mode: pointer to struct fb_videomode * @var: pointer to struct fb_var_screeninfo */ void fb_var_to_videomode(struct fb_videomode *mode, const struct fb_var_screeninfo *var) { u32 pixclock, hfreq, htotal, vtotal; mode->name = NULL; mode->xres = var->xres; mode->yres = var->yres; mode->pixclock = var->pixclock; mode->hsync_len = var->hsync_len; mode->vsync_len = var->vsync_len; mode->left_margin = var->left_margin; mode->right_margin = var->right_margin; mode->upper_margin = var->upper_margin; mode->lower_margin = var->lower_margin; mode->sync = var->sync; mode->vmode = var->vmode & FB_VMODE_MASK; mode->flag = FB_MODE_IS_FROM_VAR; mode->refresh = 0; if (!var->pixclock) return; pixclock = PICOS2KHZ(var->pixclock) * 1000; htotal = var->xres + var->right_margin + var->hsync_len + var->left_margin; vtotal = var->yres + var->lower_margin + var->vsync_len + var->upper_margin; if (var->vmode & FB_VMODE_INTERLACED) vtotal /= 2; if (var->vmode & FB_VMODE_DOUBLE) vtotal *= 2; if (!htotal || !vtotal) return; hfreq = pixclock/htotal; mode->refresh = hfreq/vtotal; } /** * fb_videomode_to_var - convert fb_videomode to fb_var_screeninfo * @var: pointer to struct fb_var_screeninfo * @mode: pointer to struct fb_videomode */ void fb_videomode_to_var(struct fb_var_screeninfo *var, const struct fb_videomode *mode) { var->xres = mode->xres; var->yres = mode->yres; var->xres_virtual = mode->xres; var->yres_virtual = mode->yres; var->xoffset = 0; var->yoffset = 0; var->pixclock = mode->pixclock; var->left_margin = mode->left_margin; var->right_margin = mode->right_margin; var->upper_margin = mode->upper_margin; var->lower_margin = mode->lower_margin; var->hsync_len = mode->hsync_len; var->vsync_len = mode->vsync_len; var->sync = mode->sync; var->vmode = mode->vmode & FB_VMODE_MASK; } /** * fb_mode_is_equal - compare 2 videomodes * @mode1: first videomode * @mode2: second videomode * * RETURNS: * 1 if equal, 0 if not */ int fb_mode_is_equal(const struct fb_videomode *mode1, const struct fb_videomode *mode2) { return (mode1->xres == mode2->xres && mode1->yres == mode2->yres && mode1->pixclock == mode2->pixclock && mode1->hsync_len == mode2->hsync_len && mode1->vsync_len == mode2->vsync_len && mode1->left_margin == mode2->left_margin && mode1->right_margin == mode2->right_margin && mode1->upper_margin == mode2->upper_margin && mode1->lower_margin == mode2->lower_margin && mode1->sync == mode2->sync && mode1->vmode == mode2->vmode); } /** * fb_find_best_mode - find best matching videomode * @var: pointer to struct fb_var_screeninfo * @head: pointer to struct list_head of modelist * * RETURNS: * struct fb_videomode, NULL if none found * * IMPORTANT: * This function assumes that all modelist entries in * info->modelist are valid. * * NOTES: * Finds best matching videomode which has an equal or greater dimension than * var->xres and var->yres. If more than 1 videomode is found, will return * the videomode with the highest refresh rate */ const struct fb_videomode *fb_find_best_mode(const struct fb_var_screeninfo *var, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *mode, *best = NULL; u32 diff = -1; list_for_each_entry(modelist, head, list) { u32 d; mode = &modelist->mode; if (mode->xres >= var->xres && mode->yres >= var->yres) { d = (mode->xres - var->xres) + (mode->yres - var->yres); if (diff > d) { diff = d; best = mode; } else if (diff == d && best && mode->refresh > best->refresh) best = mode; } } return best; } /** * fb_find_nearest_mode - find closest videomode * * @mode: pointer to struct fb_videomode * @head: pointer to modelist * * Finds best matching videomode, smaller or greater in dimension. * If more than 1 videomode is found, will return the videomode with * the closest refresh rate. */ const struct fb_videomode *fb_find_nearest_mode(const struct fb_videomode *mode, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *cmode, *best = NULL; u32 diff = -1, diff_refresh = -1; list_for_each_entry(modelist, head, list) { u32 d; cmode = &modelist->mode; d = abs(cmode->xres - mode->xres) + abs(cmode->yres - mode->yres); if (diff > d) { diff = d; diff_refresh = abs(cmode->refresh - mode->refresh); best = cmode; } else if (diff == d) { d = abs(cmode->refresh - mode->refresh); if (diff_refresh > d) { diff_refresh = d; best = cmode; } } } return best; } /** * fb_match_mode - find a videomode which exactly matches the timings in var * @var: pointer to struct fb_var_screeninfo * @head: pointer to struct list_head of modelist * * RETURNS: * struct fb_videomode, NULL if none found */ const struct fb_videomode *fb_match_mode(const struct fb_var_screeninfo *var, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *m, mode; fb_var_to_videomode(&mode, var); list_for_each_entry(modelist, head, list) { m = &modelist->mode; if (fb_mode_is_equal(m, &mode)) return m; } return NULL; } /** * fb_add_videomode - adds videomode entry to modelist * @mode: videomode to add * @head: struct list_head of modelist * * NOTES: * Will only add unmatched mode entries */ int fb_add_videomode(const struct fb_videomode *mode, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *m; int found = 0; list_for_each_entry(modelist, head, list) { m = &modelist->mode; if (fb_mode_is_equal(m, mode)) { found = 1; break; } } if (!found) { modelist = kmalloc(sizeof(struct fb_modelist), GFP_KERNEL); if (!modelist) return -ENOMEM; modelist->mode = *mode; list_add(&modelist->list, head); } return 0; } /** * fb_delete_videomode - removed videomode entry from modelist * @mode: videomode to remove * @head: struct list_head of modelist * * NOTES: * Will remove all matching mode entries */ void fb_delete_videomode(const struct fb_videomode *mode, struct list_head *head) { struct list_head *pos, *n; struct fb_modelist *modelist; struct fb_videomode *m; list_for_each_safe(pos, n, head) { modelist = list_entry(pos, struct fb_modelist, list); m = &modelist->mode; if (fb_mode_is_equal(m, mode)) { list_del(pos); kfree(pos); } } } /** * fb_destroy_modelist - destroy modelist * @head: struct list_head of modelist */ void fb_destroy_modelist(struct list_head *head) { struct list_head *pos, *n; list_for_each_safe(pos, n, head) { list_del(pos); kfree(pos); } } EXPORT_SYMBOL_GPL(fb_destroy_modelist); /** * fb_videomode_to_modelist - convert mode array to mode list * @modedb: array of struct fb_videomode * @num: number of entries in array * @head: struct list_head of modelist */ void fb_videomode_to_modelist(const struct fb_videomode *modedb, int num, struct list_head *head) { int i; INIT_LIST_HEAD(head); for (i = 0; i < num; i++) { if (fb_add_videomode(&modedb[i], head)) return; } } const struct fb_videomode *fb_find_best_display(const struct fb_monspecs *specs, struct list_head *head) { struct fb_modelist *modelist; const struct fb_videomode *m, *m1 = NULL, *md = NULL, *best = NULL; int first = 0; if (!head->prev || !head->next || list_empty(head)) goto finished; /* get the first detailed mode and the very first mode */ list_for_each_entry(modelist, head, list) { m = &modelist->mode; if (!first) { m1 = m; first = 1; } if (m->flag & FB_MODE_IS_FIRST) { md = m; break; } } /* first detailed timing is preferred */ if (specs->misc & FB_MISC_1ST_DETAIL) { best = md; goto finished; } /* find best mode based on display width and height */ if (specs->max_x && specs->max_y) { struct fb_var_screeninfo var; memset(&var, 0, sizeof(struct fb_var_screeninfo)); var.xres = (specs->max_x * 7200)/254; var.yres = (specs->max_y * 7200)/254; m = fb_find_best_mode(&var, head); if (m) { best = m; goto finished; } } /* use first detailed mode */ if (md) { best = md; goto finished; } /* last resort, use the very first mode */ best = m1; finished: return best; } EXPORT_SYMBOL(fb_find_best_display); EXPORT_SYMBOL(fb_videomode_to_var); EXPORT_SYMBOL(fb_var_to_videomode); EXPORT_SYMBOL(fb_mode_is_equal); EXPORT_SYMBOL(fb_add_videomode); EXPORT_SYMBOL(fb_match_mode); EXPORT_SYMBOL(fb_find_best_mode); EXPORT_SYMBOL(fb_find_nearest_mode); EXPORT_SYMBOL(fb_videomode_to_modelist); EXPORT_SYMBOL(fb_find_mode); EXPORT_SYMBOL(fb_find_mode_cvt);
5 5 5 5 5 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 // SPDX-License-Identifier: GPL-2.0-or-later /* * A generic kernel FIFO implementation * * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kfifo.h> #include <linux/log2.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * internal helper to calculate the unused elements in a fifo */ static inline unsigned int kfifo_unused(struct __kfifo *fifo) { return (fifo->mask + 1) - (fifo->in - fifo->out); } int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, size_t esize, gfp_t gfp_mask, int node) { /* * round up to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ size = roundup_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; if (size < 2) { fifo->data = NULL; fifo->mask = 0; return -EINVAL; } fifo->data = kmalloc_array_node(esize, size, gfp_mask, node); if (!fifo->data) { fifo->mask = 0; return -ENOMEM; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_alloc_node); void __kfifo_free(struct __kfifo *fifo) { kfree(fifo->data); fifo->in = 0; fifo->out = 0; fifo->esize = 0; fifo->data = NULL; fifo->mask = 0; } EXPORT_SYMBOL(__kfifo_free); int __kfifo_init(struct __kfifo *fifo, void *buffer, unsigned int size, size_t esize) { size /= esize; if (!is_power_of_2(size)) size = rounddown_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; fifo->data = buffer; if (size < 2) { fifo->mask = 0; return -EINVAL; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_init); static void kfifo_copy_in(struct __kfifo *fifo, const void *src, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(fifo->data + off, src, l); memcpy(fifo->data, src + l, len - l); /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); } unsigned int __kfifo_in(struct __kfifo *fifo, const void *buf, unsigned int len) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; kfifo_copy_in(fifo, buf, len, fifo->in); fifo->in += len; return len; } EXPORT_SYMBOL(__kfifo_in); static void kfifo_copy_out(struct __kfifo *fifo, void *dst, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(dst, fifo->data + off, l); memcpy(dst + l, fifo->data, len - l); /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); } unsigned int __kfifo_out_peek(struct __kfifo *fifo, void *buf, unsigned int len) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; kfifo_copy_out(fifo, buf, len, fifo->out); return len; } EXPORT_SYMBOL(__kfifo_out_peek); unsigned int __kfifo_out_linear(struct __kfifo *fifo, unsigned int *tail, unsigned int n) { unsigned int size = fifo->mask + 1; unsigned int off = fifo->out & fifo->mask; if (tail) *tail = off; return min3(n, fifo->in - fifo->out, size - off); } EXPORT_SYMBOL(__kfifo_out_linear); unsigned int __kfifo_out(struct __kfifo *fifo, void *buf, unsigned int len) { len = __kfifo_out_peek(fifo, buf, len); fifo->out += len; return len; } EXPORT_SYMBOL(__kfifo_out); static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, const void __user *from, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; unsigned long ret; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_from_user(fifo->data + off, from, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_from_user(fifo->data, from + l, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = kfifo_unused(fifo); if (len > l) len = l; ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->in += len; return err; } EXPORT_SYMBOL(__kfifo_from_user); static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_to_user(to, fifo->data + off, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_to_user(to + l, fifo->data, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_to_user(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = fifo->in - fifo->out; if (len > l) len = l; ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->out += len; return err; } EXPORT_SYMBOL(__kfifo_to_user); static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl, unsigned int data_offset, int nents, unsigned int len, dma_addr_t dma) { const void *buf = fifo->data + data_offset; if (!nents || !len) return 0; sg_set_buf(sgl, buf, len); if (dma != DMA_MAPPING_ERROR) { sg_dma_address(sgl) = dma + data_offset; sg_dma_len(sgl) = len; } return 1; } static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, unsigned int off, dma_addr_t dma) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int len_to_end; unsigned int n; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } len_to_end = min(len, size - off); n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end, dma); n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end, dma); return n; } unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->in, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare); unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->out, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare); unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { unsigned int max = (1 << (recsize << 3)) - 1; if (len > max) return max; return len; } EXPORT_SYMBOL(__kfifo_max_r); #define __KFIFO_PEEK(data, out, mask) \ ((data)[(out) & (mask)]) /* * __kfifo_peek_n internal helper function for determinate the length of * the next record in the fifo */ static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { unsigned int l; unsigned int mask = fifo->mask; unsigned char *data = fifo->data; l = __KFIFO_PEEK(data, fifo->out, mask); if (--recsize) l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; return l; } #define __KFIFO_POKE(data, in, mask, val) \ ( \ (data)[(in) & (mask)] = (unsigned char)(val) \ ) /* * __kfifo_poke_n internal helper function for storing the length of * the record into the fifo */ static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { unsigned int mask = fifo->mask; unsigned char *data = fifo->data; __KFIFO_POKE(data, fifo->in, mask, n); if (recsize > 1) __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { return __kfifo_peek_n(fifo, recsize); } EXPORT_SYMBOL(__kfifo_len_r); unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, unsigned int len, size_t recsize) { if (len + recsize > kfifo_unused(fifo)) return 0; __kfifo_poke_n(fifo, len, recsize); kfifo_copy_in(fifo, buf, len, fifo->in + recsize); fifo->in += len + recsize; return len; } EXPORT_SYMBOL(__kfifo_in_r); static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize, unsigned int *n) { *n = __kfifo_peek_n(fifo, recsize); if (len > *n) len = *n; kfifo_copy_out(fifo, buf, len, fifo->out + recsize); return len; } unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } EXPORT_SYMBOL(__kfifo_out_peek_r); unsigned int __kfifo_out_linear_r(struct __kfifo *fifo, unsigned int *tail, unsigned int n, size_t recsize) { if (fifo->in == fifo->out) return 0; if (tail) *tail = fifo->out + recsize; return min(n, __kfifo_peek_n(fifo, recsize)); } EXPORT_SYMBOL(__kfifo_out_linear_r); unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); fifo->out += n + recsize; return len; } EXPORT_SYMBOL(__kfifo_out_r); void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) { unsigned int n; n = __kfifo_peek_n(fifo, recsize); fifo->out += n + recsize; } EXPORT_SYMBOL(__kfifo_skip_r); int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) { *copied = 0; return 0; } __kfifo_poke_n(fifo, len, recsize); ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->in += len + recsize; return 0; } EXPORT_SYMBOL(__kfifo_from_user_r); int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; unsigned int n; if (fifo->in == fifo->out) { *copied = 0; return 0; } n = __kfifo_peek_n(fifo, recsize); if (len > n) len = n; ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->out += n + recsize; return 0; } EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); void __kfifo_dma_in_finish_r(struct __kfifo *fifo, unsigned int len, size_t recsize) { len = __kfifo_max_r(len, recsize); __kfifo_poke_n(fifo, len, recsize); fifo->in += len + recsize; } EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > fifo->in - fifo->out) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
19 19 17 16 17 17 18 18 17 18 18 18 18 2 2 2 2 18 18 18 18 18 18 18 18 17 18 12 17 18 16 16 16 11 11 1 1 1 2 2 2 1 1 1 2 25 1 19 19 19 19 19 19 19 18 18 18 18 18 17 17 17 12 11 11 11 11 11 11 4 4 4 11 11 10 11 11 4 4 4 16 17 17 16 17 17 17 17 17 17 17 19 19 18 16 7 4 19 19 15 4 4 4 4 4 4 4 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 9 9 8 7 1 6 1 5 6 6 6 6 1 1 6 5 5 8 5 2 3 5 2 1 4 1 3 3 2 3 3 3 3 2 2 1 1 1 1 1 1 1 1 1 1 2 17 17 17 1 17 17 12 11 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 /* * drm_irq.c IRQ and vblank support * * \author Rickard E. (Rik) Faith <faith@valinux.com> * \author Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/export.h> #include <linux/kthread.h> #include <linux/moduleparam.h> #include <drm/drm_crtc.h> #include <drm/drm_drv.h> #include <drm/drm_framebuffer.h> #include <drm/drm_managed.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_print.h> #include <drm/drm_vblank.h> #include "drm_internal.h" #include "drm_trace.h" /** * DOC: vblank handling * * From the computer's perspective, every time the monitor displays * a new frame the scanout engine has "scanned out" the display image * from top to bottom, one row of pixels at a time. The current row * of pixels is referred to as the current scanline. * * In addition to the display's visible area, there's usually a couple of * extra scanlines which aren't actually displayed on the screen. * These extra scanlines don't contain image data and are occasionally used * for features like audio and infoframes. The region made up of these * scanlines is referred to as the vertical blanking region, or vblank for * short. * * For historical reference, the vertical blanking period was designed to * give the electron gun (on CRTs) enough time to move back to the top of * the screen to start scanning out the next frame. Similar for horizontal * blanking periods. They were designed to give the electron gun enough * time to move back to the other side of the screen to start scanning the * next scanline. * * :: * * * physical → ⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽ * top of | | * display | | * | New frame | * | | * |↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓| * |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| ← Scanline, * |↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓| updates the * | | frame as it * | | travels down * | | ("scan out") * | Old frame | * | | * | | * | | * | | physical * | | bottom of * vertical |⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽| ← display * blanking ┆xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx┆ * region → ┆xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx┆ * ┆xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx┆ * start of → ⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽⎽ * new frame * * "Physical top of display" is the reference point for the high-precision/ * corrected timestamp. * * On a lot of display hardware, programming needs to take effect during the * vertical blanking period so that settings like gamma, the image buffer * buffer to be scanned out, etc. can safely be changed without showing * any visual artifacts on the screen. In some unforgiving hardware, some of * this programming has to both start and end in the same vblank. To help * with the timing of the hardware programming, an interrupt is usually * available to notify the driver when it can start the updating of registers. * The interrupt is in this context named the vblank interrupt. * * The vblank interrupt may be fired at different points depending on the * hardware. Some hardware implementations will fire the interrupt when the * new frame start, other implementations will fire the interrupt at different * points in time. * * Vertical blanking plays a major role in graphics rendering. To achieve * tear-free display, users must synchronize page flips and/or rendering to * vertical blanking. The DRM API offers ioctls to perform page flips * synchronized to vertical blanking and wait for vertical blanking. * * The DRM core handles most of the vertical blanking management logic, which * involves filtering out spurious interrupts, keeping race-free blanking * counters, coping with counter wrap-around and resets and keeping use counts. * It relies on the driver to generate vertical blanking interrupts and * optionally provide a hardware vertical blanking counter. * * Drivers must initialize the vertical blanking handling core with a call to * drm_vblank_init(). Minimally, a driver needs to implement * &drm_crtc_funcs.enable_vblank and &drm_crtc_funcs.disable_vblank plus call * drm_crtc_handle_vblank() in its vblank interrupt handler for working vblank * support. * * Vertical blanking interrupts can be enabled by the DRM core or by drivers * themselves (for instance to handle page flipping operations). The DRM core * maintains a vertical blanking use count to ensure that the interrupts are not * disabled while a user still needs them. To increment the use count, drivers * call drm_crtc_vblank_get() and release the vblank reference again with * drm_crtc_vblank_put(). In between these two calls vblank interrupts are * guaranteed to be enabled. * * On many hardware disabling the vblank interrupt cannot be done in a race-free * manner, see &drm_vblank_crtc_config.disable_immediate and * &drm_driver.max_vblank_count. In that case the vblank core only disables the * vblanks after a timer has expired, which can be configured through the * ``vblankoffdelay`` module parameter. * * Drivers for hardware without support for vertical-blanking interrupts can * use DRM vblank timers to send vblank events at the rate of the current * display mode's refresh. While not synchronized to the hardware's * vertical-blanking regions, the timer helps DRM clients and compositors to * adapt their update cycle to the display output. Drivers should set up * vblanking as usual, but call drm_crtc_vblank_start_timer() and * drm_crtc_vblank_cancel_timer() as part of their atomic mode setting. * See also DRM vblank helpers for more information. * * Drivers without support for vertical-blanking interrupts nor timers must * not call drm_vblank_init(). For these drivers, atomic helpers will * automatically generate fake vblank events as part of the display update. * This functionality also can be controlled by the driver by enabling and * disabling struct drm_crtc_state.no_vblank. */ /* Retry timestamp calculation up to 3 times to satisfy * drm_timestamp_precision before giving up. */ #define DRM_TIMESTAMP_MAXRETRIES 3 /* Threshold in nanoseconds for detection of redundant * vblank irq in drm_handle_vblank(). 1 msec should be ok. */ #define DRM_REDUNDANT_VBLIRQ_THRESH_NS 1000000 static bool drm_get_last_vbltimestamp(struct drm_device *dev, unsigned int pipe, ktime_t *tvblank, bool in_vblank_irq); static unsigned int drm_timestamp_precision = 20; /* Default to 20 usecs. */ static int drm_vblank_offdelay = 5000; /* Default to 5000 msecs. */ module_param_named(vblankoffdelay, drm_vblank_offdelay, int, 0600); module_param_named(timestamp_precision_usec, drm_timestamp_precision, int, 0600); MODULE_PARM_DESC(vblankoffdelay, "Delay until vblank irq auto-disable [msecs] (0: never disable, <0: disable immediately)"); MODULE_PARM_DESC(timestamp_precision_usec, "Max. error on timestamps [usecs]"); static struct drm_vblank_crtc * drm_vblank_crtc(struct drm_device *dev, unsigned int pipe) { return &dev->vblank[pipe]; } struct drm_vblank_crtc * drm_crtc_vblank_crtc(struct drm_crtc *crtc) { return drm_vblank_crtc(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_crtc); static void store_vblank(struct drm_device *dev, unsigned int pipe, u32 vblank_count_inc, ktime_t t_vblank, u32 last) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); assert_spin_locked(&dev->vblank_time_lock); vblank->last = last; write_seqlock(&vblank->seqlock); vblank->time = t_vblank; atomic64_add(vblank_count_inc, &vblank->count); write_sequnlock(&vblank->seqlock); } static u32 drm_max_vblank_count(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); return vblank->max_vblank_count ?: dev->max_vblank_count; } /* * "No hw counter" fallback implementation of .get_vblank_counter() hook, * if there is no usable hardware frame counter available. */ static u32 drm_vblank_no_hw_counter(struct drm_device *dev, unsigned int pipe) { drm_WARN_ON_ONCE(dev, drm_max_vblank_count(dev, pipe) != 0); return 0; } static u32 __get_vblank_counter(struct drm_device *dev, unsigned int pipe) { if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (drm_WARN_ON(dev, !crtc)) return 0; if (crtc->funcs->get_vblank_counter) return crtc->funcs->get_vblank_counter(crtc); } return drm_vblank_no_hw_counter(dev, pipe); } /* * Reset the stored timestamp for the current vblank count to correspond * to the last vblank occurred. * * Only to be called from drm_crtc_vblank_on(). * * Note: caller must hold &drm_device.vbl_lock since this reads & writes * device vblank fields. */ static void drm_reset_vblank_timestamp(struct drm_device *dev, unsigned int pipe) { u32 cur_vblank; bool rc; ktime_t t_vblank; int count = DRM_TIMESTAMP_MAXRETRIES; spin_lock(&dev->vblank_time_lock); /* * sample the current counter to avoid random jumps * when drm_vblank_enable() applies the diff */ do { cur_vblank = __get_vblank_counter(dev, pipe); rc = drm_get_last_vbltimestamp(dev, pipe, &t_vblank, false); } while (cur_vblank != __get_vblank_counter(dev, pipe) && --count > 0); /* * Only reinitialize corresponding vblank timestamp if high-precision query * available and didn't fail. Otherwise reinitialize delayed at next vblank * interrupt and assign 0 for now, to mark the vblanktimestamp as invalid. */ if (!rc) t_vblank = 0; /* * +1 to make sure user will never see the same * vblank counter value before and after a modeset */ store_vblank(dev, pipe, 1, t_vblank, cur_vblank); spin_unlock(&dev->vblank_time_lock); } /* * Call back into the driver to update the appropriate vblank counter * (specified by @pipe). Deal with wraparound, if it occurred, and * update the last read value so we can deal with wraparound on the next * call if necessary. * * Only necessary when going from off->on, to account for frames we * didn't get an interrupt for. * * Note: caller must hold &drm_device.vbl_lock since this reads & writes * device vblank fields. */ static void drm_update_vblank_count(struct drm_device *dev, unsigned int pipe, bool in_vblank_irq) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); u32 cur_vblank, diff; bool rc; ktime_t t_vblank; int count = DRM_TIMESTAMP_MAXRETRIES; int framedur_ns = vblank->framedur_ns; u32 max_vblank_count = drm_max_vblank_count(dev, pipe); /* * Interrupts were disabled prior to this call, so deal with counter * wrap if needed. * NOTE! It's possible we lost a full dev->max_vblank_count + 1 events * here if the register is small or we had vblank interrupts off for * a long time. * * We repeat the hardware vblank counter & timestamp query until * we get consistent results. This to prevent races between gpu * updating its hardware counter while we are retrieving the * corresponding vblank timestamp. */ do { cur_vblank = __get_vblank_counter(dev, pipe); rc = drm_get_last_vbltimestamp(dev, pipe, &t_vblank, in_vblank_irq); } while (cur_vblank != __get_vblank_counter(dev, pipe) && --count > 0); if (max_vblank_count) { /* trust the hw counter when it's around */ diff = (cur_vblank - vblank->last) & max_vblank_count; } else if (rc && framedur_ns) { u64 diff_ns = ktime_to_ns(ktime_sub(t_vblank, vblank->time)); /* * Figure out how many vblanks we've missed based * on the difference in the timestamps and the * frame/field duration. */ drm_dbg_vbl(dev, "crtc %u: Calculating number of vblanks." " diff_ns = %lld, framedur_ns = %d)\n", pipe, (long long)diff_ns, framedur_ns); diff = DIV_ROUND_CLOSEST_ULL(diff_ns, framedur_ns); if (diff == 0 && in_vblank_irq) drm_dbg_vbl(dev, "crtc %u: Redundant vblirq ignored\n", pipe); } else { /* some kind of default for drivers w/o accurate vbl timestamping */ diff = in_vblank_irq ? 1 : 0; } /* * Within a drm_vblank_pre_modeset - drm_vblank_post_modeset * interval? If so then vblank irqs keep running and it will likely * happen that the hardware vblank counter is not trustworthy as it * might reset at some point in that interval and vblank timestamps * are not trustworthy either in that interval. Iow. this can result * in a bogus diff >> 1 which must be avoided as it would cause * random large forward jumps of the software vblank counter. */ if (diff > 1 && (vblank->inmodeset & 0x2)) { drm_dbg_vbl(dev, "clamping vblank bump to 1 on crtc %u: diffr=%u" " due to pre-modeset.\n", pipe, diff); diff = 1; } drm_dbg_vbl(dev, "updating vblank count on crtc %u:" " current=%llu, diff=%u, hw=%u hw_last=%u\n", pipe, (unsigned long long)atomic64_read(&vblank->count), diff, cur_vblank, vblank->last); if (diff == 0) { drm_WARN_ON_ONCE(dev, cur_vblank != vblank->last); return; } /* * Only reinitialize corresponding vblank timestamp if high-precision query * available and didn't fail, or we were called from the vblank interrupt. * Otherwise reinitialize delayed at next vblank interrupt and assign 0 * for now, to mark the vblanktimestamp as invalid. */ if (!rc && !in_vblank_irq) t_vblank = 0; store_vblank(dev, pipe, diff, t_vblank, cur_vblank); } u64 drm_vblank_count(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); u64 count; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return 0; count = atomic64_read(&vblank->count); /* * This read barrier corresponds to the implicit write barrier of the * write seqlock in store_vblank(). Note that this is the only place * where we need an explicit barrier, since all other access goes * through drm_vblank_count_and_time(), which already has the required * read barrier curtesy of the read seqlock. */ smp_rmb(); return count; } /** * drm_crtc_accurate_vblank_count - retrieve the master vblank counter * @crtc: which counter to retrieve * * This function is similar to drm_crtc_vblank_count() but this function * interpolates to handle a race with vblank interrupts using the high precision * timestamping support. * * This is mostly useful for hardware that can obtain the scanout position, but * doesn't have a hardware frame counter. */ u64 drm_crtc_accurate_vblank_count(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); u64 vblank; unsigned long flags; drm_WARN_ONCE(dev, drm_debug_enabled(DRM_UT_VBL) && !crtc->funcs->get_vblank_timestamp, "This function requires support for accurate vblank timestamps."); spin_lock_irqsave(&dev->vblank_time_lock, flags); drm_update_vblank_count(dev, pipe, false); vblank = drm_vblank_count(dev, pipe); spin_unlock_irqrestore(&dev->vblank_time_lock, flags); return vblank; } EXPORT_SYMBOL(drm_crtc_accurate_vblank_count); static void __disable_vblank(struct drm_device *dev, unsigned int pipe) { if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (drm_WARN_ON(dev, !crtc)) return; if (crtc->funcs->disable_vblank) crtc->funcs->disable_vblank(crtc); } } /* * Disable vblank irq's on crtc, make sure that last vblank count * of hardware and corresponding consistent software vblank counter * are preserved, even if there are any spurious vblank irq's after * disable. */ void drm_vblank_disable_and_save(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); unsigned long irqflags; assert_spin_locked(&dev->vbl_lock); /* Prevent vblank irq processing while disabling vblank irqs, * so no updates of timestamps or count can happen after we've * disabled. Needed to prevent races in case of delayed irq's. */ spin_lock_irqsave(&dev->vblank_time_lock, irqflags); /* * Update vblank count and disable vblank interrupts only if the * interrupts were enabled. This avoids calling the ->disable_vblank() * operation in atomic context with the hardware potentially runtime * suspended. */ if (!vblank->enabled) goto out; /* * Update the count and timestamp to maintain the * appearance that the counter has been ticking all along until * this time. This makes the count account for the entire time * between drm_crtc_vblank_on() and drm_crtc_vblank_off(). */ drm_update_vblank_count(dev, pipe, false); __disable_vblank(dev, pipe); vblank->enabled = false; out: spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags); } static void vblank_disable_fn(struct timer_list *t) { struct drm_vblank_crtc *vblank = timer_container_of(vblank, t, disable_timer); struct drm_device *dev = vblank->dev; unsigned int pipe = vblank->pipe; unsigned long irqflags; spin_lock_irqsave(&dev->vbl_lock, irqflags); if (atomic_read(&vblank->refcount) == 0 && vblank->enabled) { drm_dbg_core(dev, "disabling vblank on crtc %u\n", pipe); drm_vblank_disable_and_save(dev, pipe); } spin_unlock_irqrestore(&dev->vbl_lock, irqflags); } static void drm_vblank_init_release(struct drm_device *dev, void *ptr) { struct drm_vblank_crtc *vblank = ptr; drm_WARN_ON(dev, READ_ONCE(vblank->enabled) && drm_core_check_feature(dev, DRIVER_MODESET)); if (vblank->vblank_timer.crtc) hrtimer_cancel(&vblank->vblank_timer.timer); drm_vblank_destroy_worker(vblank); timer_delete_sync(&vblank->disable_timer); } /** * drm_vblank_init - initialize vblank support * @dev: DRM device * @num_crtcs: number of CRTCs supported by @dev * * This function initializes vblank support for @num_crtcs display pipelines. * Cleanup is handled automatically through a cleanup function added with * drmm_add_action_or_reset(). * * Returns: * Zero on success or a negative error code on failure. */ int drm_vblank_init(struct drm_device *dev, unsigned int num_crtcs) { int ret; unsigned int i; spin_lock_init(&dev->vbl_lock); spin_lock_init(&dev->vblank_time_lock); dev->vblank = drmm_kcalloc(dev, num_crtcs, sizeof(*dev->vblank), GFP_KERNEL); if (!dev->vblank) return -ENOMEM; dev->num_crtcs = num_crtcs; for (i = 0; i < num_crtcs; i++) { struct drm_vblank_crtc *vblank = &dev->vblank[i]; vblank->dev = dev; vblank->pipe = i; init_waitqueue_head(&vblank->queue); timer_setup(&vblank->disable_timer, vblank_disable_fn, 0); seqlock_init(&vblank->seqlock); ret = drmm_add_action_or_reset(dev, drm_vblank_init_release, vblank); if (ret) return ret; ret = drm_vblank_worker_init(vblank); if (ret) return ret; } return 0; } EXPORT_SYMBOL(drm_vblank_init); /** * drm_dev_has_vblank - test if vblanking has been initialized for * a device * @dev: the device * * Drivers may call this function to test if vblank support is * initialized for a device. For most hardware this means that vblanking * can also be enabled. * * Atomic helpers use this function to initialize * &drm_crtc_state.no_vblank. See also drm_atomic_helper_check_modeset(). * * Returns: * True if vblanking has been initialized for the given device, false * otherwise. */ bool drm_dev_has_vblank(const struct drm_device *dev) { return dev->num_crtcs != 0; } EXPORT_SYMBOL(drm_dev_has_vblank); /** * drm_crtc_vblank_waitqueue - get vblank waitqueue for the CRTC * @crtc: which CRTC's vblank waitqueue to retrieve * * This function returns a pointer to the vblank waitqueue for the CRTC. * Drivers can use this to implement vblank waits using wait_event() and related * functions. */ wait_queue_head_t *drm_crtc_vblank_waitqueue(struct drm_crtc *crtc) { return &crtc->dev->vblank[drm_crtc_index(crtc)].queue; } EXPORT_SYMBOL(drm_crtc_vblank_waitqueue); /** * drm_calc_timestamping_constants - calculate vblank timestamp constants * @crtc: drm_crtc whose timestamp constants should be updated. * @mode: display mode containing the scanout timings * * Calculate and store various constants which are later needed by vblank and * swap-completion timestamping, e.g, by * drm_crtc_vblank_helper_get_vblank_timestamp(). They are derived from * CRTC's true scanout timing, so they take things like panel scaling or * other adjustments into account. */ void drm_calc_timestamping_constants(struct drm_crtc *crtc, const struct drm_display_mode *mode) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); int linedur_ns = 0, framedur_ns = 0; int dotclock = mode->crtc_clock; if (!drm_dev_has_vblank(dev)) return; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; /* Valid dotclock? */ if (dotclock > 0) { int frame_size = mode->crtc_htotal * mode->crtc_vtotal; /* * Convert scanline length in pixels and video * dot clock to line duration and frame duration * in nanoseconds: */ linedur_ns = div_u64((u64) mode->crtc_htotal * 1000000, dotclock); framedur_ns = div_u64((u64) frame_size * 1000000, dotclock); /* * Fields of interlaced scanout modes are only half a frame duration. */ if (mode->flags & DRM_MODE_FLAG_INTERLACE) framedur_ns /= 2; } else { drm_err(dev, "crtc %u: Can't calculate constants, dotclock = 0!\n", crtc->base.id); } vblank->linedur_ns = linedur_ns; vblank->framedur_ns = framedur_ns; drm_mode_copy(&vblank->hwmode, mode); drm_dbg_core(dev, "crtc %u: hwmode: htotal %d, vtotal %d, vdisplay %d\n", crtc->base.id, mode->crtc_htotal, mode->crtc_vtotal, mode->crtc_vdisplay); drm_dbg_core(dev, "crtc %u: clock %d kHz framedur %d linedur %d\n", crtc->base.id, dotclock, framedur_ns, linedur_ns); } EXPORT_SYMBOL(drm_calc_timestamping_constants); /** * drm_crtc_vblank_helper_get_vblank_timestamp_internal - precise vblank * timestamp helper * @crtc: CRTC whose vblank timestamp to retrieve * @max_error: Desired maximum allowable error in timestamps (nanosecs) * On return contains true maximum error of timestamp * @vblank_time: Pointer to time which should receive the timestamp * @in_vblank_irq: * True when called from drm_crtc_handle_vblank(). Some drivers * need to apply some workarounds for gpu-specific vblank irq quirks * if flag is set. * @get_scanout_position: * Callback function to retrieve the scanout position. See * @struct drm_crtc_helper_funcs.get_scanout_position. * * Implements calculation of exact vblank timestamps from given drm_display_mode * timings and current video scanout position of a CRTC. * * The current implementation only handles standard video modes. For double scan * and interlaced modes the driver is supposed to adjust the hardware mode * (taken from &drm_crtc_state.adjusted mode for atomic modeset drivers) to * match the scanout position reported. * * Note that atomic drivers must call drm_calc_timestamping_constants() before * enabling a CRTC. The atomic helpers already take care of that in * drm_atomic_helper_calc_timestamping_constants(). * * Returns: * Returns true on success, and false on failure, i.e. when no accurate * timestamp could be acquired. */ bool drm_crtc_vblank_helper_get_vblank_timestamp_internal( struct drm_crtc *crtc, int *max_error, ktime_t *vblank_time, bool in_vblank_irq, drm_vblank_get_scanout_position_func get_scanout_position) { struct drm_device *dev = crtc->dev; unsigned int pipe = crtc->index; struct drm_vblank_crtc *vblank = &dev->vblank[pipe]; struct timespec64 ts_etime, ts_vblank_time; ktime_t stime, etime; bool vbl_status; const struct drm_display_mode *mode; int vpos, hpos, i; int delta_ns, duration_ns; if (pipe >= dev->num_crtcs) { drm_err(dev, "Invalid crtc %u\n", pipe); return false; } /* Scanout position query not supported? Should not happen. */ if (!get_scanout_position) { drm_err(dev, "Called from CRTC w/o get_scanout_position()!?\n"); return false; } if (drm_drv_uses_atomic_modeset(dev)) mode = &vblank->hwmode; else mode = &crtc->hwmode; /* If mode timing undefined, just return as no-op: * Happens during initial modesetting of a crtc. */ if (mode->crtc_clock == 0) { drm_dbg_core(dev, "crtc %u: Noop due to uninitialized mode.\n", pipe); drm_WARN_ON_ONCE(dev, drm_drv_uses_atomic_modeset(dev)); return false; } /* Get current scanout position with system timestamp. * Repeat query up to DRM_TIMESTAMP_MAXRETRIES times * if single query takes longer than max_error nanoseconds. * * This guarantees a tight bound on maximum error if * code gets preempted or delayed for some reason. */ for (i = 0; i < DRM_TIMESTAMP_MAXRETRIES; i++) { /* * Get vertical and horizontal scanout position vpos, hpos, * and bounding timestamps stime, etime, pre/post query. */ vbl_status = get_scanout_position(crtc, in_vblank_irq, &vpos, &hpos, &stime, &etime, mode); /* Return as no-op if scanout query unsupported or failed. */ if (!vbl_status) { drm_dbg_core(dev, "crtc %u : scanoutpos query failed.\n", pipe); return false; } /* Compute uncertainty in timestamp of scanout position query. */ duration_ns = ktime_to_ns(etime) - ktime_to_ns(stime); /* Accept result with < max_error nsecs timing uncertainty. */ if (duration_ns <= *max_error) break; } /* Noisy system timing? */ if (i == DRM_TIMESTAMP_MAXRETRIES) { drm_dbg_core(dev, "crtc %u: Noisy timestamp %d us > %d us [%d reps].\n", pipe, duration_ns / 1000, *max_error / 1000, i); } /* Return upper bound of timestamp precision error. */ *max_error = duration_ns; /* Convert scanout position into elapsed time at raw_time query * since start of scanout at first display scanline. delta_ns * can be negative if start of scanout hasn't happened yet. */ delta_ns = div_s64(1000000LL * (vpos * mode->crtc_htotal + hpos), mode->crtc_clock); /* Subtract time delta from raw timestamp to get final * vblank_time timestamp for end of vblank. */ *vblank_time = ktime_sub_ns(etime, delta_ns); if (!drm_debug_enabled(DRM_UT_VBL)) return true; ts_etime = ktime_to_timespec64(etime); ts_vblank_time = ktime_to_timespec64(*vblank_time); drm_dbg_vbl(dev, "crtc %u : v p(%d,%d)@ %ptSp -> %ptSp [e %d us, %d rep]\n", pipe, hpos, vpos, &ts_etime, &ts_vblank_time, duration_ns / 1000, i); return true; } EXPORT_SYMBOL(drm_crtc_vblank_helper_get_vblank_timestamp_internal); /** * drm_crtc_vblank_helper_get_vblank_timestamp - precise vblank timestamp * helper * @crtc: CRTC whose vblank timestamp to retrieve * @max_error: Desired maximum allowable error in timestamps (nanosecs) * On return contains true maximum error of timestamp * @vblank_time: Pointer to time which should receive the timestamp * @in_vblank_irq: * True when called from drm_crtc_handle_vblank(). Some drivers * need to apply some workarounds for gpu-specific vblank irq quirks * if flag is set. * * Implements calculation of exact vblank timestamps from given drm_display_mode * timings and current video scanout position of a CRTC. This can be directly * used as the &drm_crtc_funcs.get_vblank_timestamp implementation of a kms * driver if &drm_crtc_helper_funcs.get_scanout_position is implemented. * * The current implementation only handles standard video modes. For double scan * and interlaced modes the driver is supposed to adjust the hardware mode * (taken from &drm_crtc_state.adjusted mode for atomic modeset drivers) to * match the scanout position reported. * * Note that atomic drivers must call drm_calc_timestamping_constants() before * enabling a CRTC. The atomic helpers already take care of that in * drm_atomic_helper_calc_timestamping_constants(). * * Returns: * Returns true on success, and false on failure, i.e. when no accurate * timestamp could be acquired. */ bool drm_crtc_vblank_helper_get_vblank_timestamp(struct drm_crtc *crtc, int *max_error, ktime_t *vblank_time, bool in_vblank_irq) { return drm_crtc_vblank_helper_get_vblank_timestamp_internal( crtc, max_error, vblank_time, in_vblank_irq, crtc->helper_private->get_scanout_position); } EXPORT_SYMBOL(drm_crtc_vblank_helper_get_vblank_timestamp); /** * drm_crtc_get_last_vbltimestamp - retrieve raw timestamp for the most * recent vblank interval * @crtc: CRTC whose vblank timestamp to retrieve * @tvblank: Pointer to target time which should receive the timestamp * @in_vblank_irq: * True when called from drm_crtc_handle_vblank(). Some drivers * need to apply some workarounds for gpu-specific vblank irq quirks * if flag is set. * * Fetches the system timestamp corresponding to the time of the most recent * vblank interval on specified CRTC. May call into kms-driver to * compute the timestamp with a high-precision GPU specific method. * * Returns zero if timestamp originates from uncorrected do_gettimeofday() * call, i.e., it isn't very precisely locked to the true vblank. * * Returns: * True if timestamp is considered to be very precise, false otherwise. */ static bool drm_crtc_get_last_vbltimestamp(struct drm_crtc *crtc, ktime_t *tvblank, bool in_vblank_irq) { bool ret = false; /* Define requested maximum error on timestamps (nanoseconds). */ int max_error = (int) drm_timestamp_precision * 1000; /* Query driver if possible and precision timestamping enabled. */ if (crtc && crtc->funcs->get_vblank_timestamp && max_error > 0) { ret = crtc->funcs->get_vblank_timestamp(crtc, &max_error, tvblank, in_vblank_irq); } /* GPU high precision timestamp query unsupported or failed. * Return current monotonic/gettimeofday timestamp as best estimate. */ if (!ret) *tvblank = ktime_get(); return ret; } static bool drm_get_last_vbltimestamp(struct drm_device *dev, unsigned int pipe, ktime_t *tvblank, bool in_vblank_irq) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); return drm_crtc_get_last_vbltimestamp(crtc, tvblank, in_vblank_irq); } /** * drm_crtc_vblank_count - retrieve "cooked" vblank counter value * @crtc: which counter to retrieve * * Fetches the "cooked" vblank count value that represents the number of * vblank events since the system was booted, including lost events due to * modesetting activity. Note that this timer isn't correct against a racing * vblank interrupt (since it only reports the software vblank counter), see * drm_crtc_accurate_vblank_count() for such use-cases. * * Note that for a given vblank counter value drm_crtc_handle_vblank() * and drm_crtc_vblank_count() or drm_crtc_vblank_count_and_time() * provide a barrier: Any writes done before calling * drm_crtc_handle_vblank() will be visible to callers of the later * functions, if the vblank count is the same or a later one. * * See also &drm_vblank_crtc.count. * * Returns: * The software vblank counter. */ u64 drm_crtc_vblank_count(struct drm_crtc *crtc) { return drm_vblank_count(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_count); /** * drm_vblank_count_and_time - retrieve "cooked" vblank counter value and the * system timestamp corresponding to that vblank counter value. * @dev: DRM device * @pipe: index of CRTC whose counter to retrieve * @vblanktime: Pointer to ktime_t to receive the vblank timestamp. * * Fetches the "cooked" vblank count value that represents the number of * vblank events since the system was booted, including lost events due to * modesetting activity. Returns corresponding system timestamp of the time * of the vblank interval that corresponds to the current vblank counter value. * * This is the legacy version of drm_crtc_vblank_count_and_time(). */ static u64 drm_vblank_count_and_time(struct drm_device *dev, unsigned int pipe, ktime_t *vblanktime) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); u64 vblank_count; unsigned int seq; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) { *vblanktime = 0; return 0; } do { seq = read_seqbegin(&vblank->seqlock); vblank_count = atomic64_read(&vblank->count); *vblanktime = vblank->time; } while (read_seqretry(&vblank->seqlock, seq)); return vblank_count; } /** * drm_crtc_vblank_count_and_time - retrieve "cooked" vblank counter value * and the system timestamp corresponding to that vblank counter value * @crtc: which counter to retrieve * @vblanktime: Pointer to time to receive the vblank timestamp. * * Fetches the "cooked" vblank count value that represents the number of * vblank events since the system was booted, including lost events due to * modesetting activity. Returns corresponding system timestamp of the time * of the vblank interval that corresponds to the current vblank counter value. * * Note that for a given vblank counter value drm_crtc_handle_vblank() * and drm_crtc_vblank_count() or drm_crtc_vblank_count_and_time() * provide a barrier: Any writes done before calling * drm_crtc_handle_vblank() will be visible to callers of the later * functions, if the vblank count is the same or a later one. * * See also &drm_vblank_crtc.count. */ u64 drm_crtc_vblank_count_and_time(struct drm_crtc *crtc, ktime_t *vblanktime) { return drm_vblank_count_and_time(crtc->dev, drm_crtc_index(crtc), vblanktime); } EXPORT_SYMBOL(drm_crtc_vblank_count_and_time); /** * drm_crtc_next_vblank_start - calculate the time of the next vblank * @crtc: the crtc for which to calculate next vblank time * @vblanktime: pointer to time to receive the next vblank timestamp. * * Calculate the expected time of the start of the next vblank period, * based on time of previous vblank and frame duration */ int drm_crtc_next_vblank_start(struct drm_crtc *crtc, ktime_t *vblanktime) { struct drm_vblank_crtc *vblank; struct drm_display_mode *mode; u64 vblank_start; if (!drm_dev_has_vblank(crtc->dev)) return -EINVAL; vblank = drm_crtc_vblank_crtc(crtc); mode = &vblank->hwmode; if (!vblank->framedur_ns || !vblank->linedur_ns) return -EINVAL; if (!drm_crtc_get_last_vbltimestamp(crtc, vblanktime, false)) return -EINVAL; vblank_start = DIV_ROUND_DOWN_ULL( (u64)vblank->framedur_ns * mode->crtc_vblank_start, mode->crtc_vtotal); *vblanktime = ktime_add(*vblanktime, ns_to_ktime(vblank_start)); return 0; } EXPORT_SYMBOL(drm_crtc_next_vblank_start); static void send_vblank_event(struct drm_device *dev, struct drm_pending_vblank_event *e, u64 seq, ktime_t now) { struct timespec64 tv; switch (e->event.base.type) { case DRM_EVENT_VBLANK: case DRM_EVENT_FLIP_COMPLETE: tv = ktime_to_timespec64(now); e->event.vbl.sequence = seq; /* * e->event is a user space structure, with hardcoded unsigned * 32-bit seconds/microseconds. This is safe as we always use * monotonic timestamps since linux-4.15 */ e->event.vbl.tv_sec = tv.tv_sec; e->event.vbl.tv_usec = tv.tv_nsec / 1000; break; case DRM_EVENT_CRTC_SEQUENCE: if (seq) e->event.seq.sequence = seq; e->event.seq.time_ns = ktime_to_ns(now); break; } trace_drm_vblank_event_delivered(e->base.file_priv, e->pipe, seq); /* * Use the same timestamp for any associated fence signal to avoid * mismatch in timestamps for vsync & fence events triggered by the * same HW event. Frameworks like SurfaceFlinger in Android expects the * retire-fence timestamp to match exactly with HW vsync as it uses it * for its software vsync modeling. */ drm_send_event_timestamp_locked(dev, &e->base, now); } /** * drm_crtc_arm_vblank_event - arm vblank event after pageflip * @crtc: the source CRTC of the vblank event * @e: the event to send * * A lot of drivers need to generate vblank events for the very next vblank * interrupt. For example when the page flip interrupt happens when the page * flip gets armed, but not when it actually executes within the next vblank * period. This helper function implements exactly the required vblank arming * behaviour. * * NOTE: Drivers using this to send out the &drm_crtc_state.event as part of an * atomic commit must ensure that the next vblank happens at exactly the same * time as the atomic commit is committed to the hardware. This function itself * does **not** protect against the next vblank interrupt racing with either this * function call or the atomic commit operation. A possible sequence could be: * * 1. Driver commits new hardware state into vblank-synchronized registers. * 2. A vblank happens, committing the hardware state. Also the corresponding * vblank interrupt is fired off and fully processed by the interrupt * handler. * 3. The atomic commit operation proceeds to call drm_crtc_arm_vblank_event(). * 4. The event is only send out for the next vblank, which is wrong. * * An equivalent race can happen when the driver calls * drm_crtc_arm_vblank_event() before writing out the new hardware state. * * The only way to make this work safely is to prevent the vblank from firing * (and the hardware from committing anything else) until the entire atomic * commit sequence has run to completion. If the hardware does not have such a * feature (e.g. using a "go" bit), then it is unsafe to use this functions. * Instead drivers need to manually send out the event from their interrupt * handler by calling drm_crtc_send_vblank_event() and make sure that there's no * possible race with the hardware committing the atomic update. * * Caller must hold a vblank reference for the event @e acquired by a * drm_crtc_vblank_get(), which will be dropped when the next vblank arrives. */ void drm_crtc_arm_vblank_event(struct drm_crtc *crtc, struct drm_pending_vblank_event *e) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); assert_spin_locked(&dev->event_lock); e->pipe = pipe; e->sequence = drm_crtc_accurate_vblank_count(crtc) + 1; list_add_tail(&e->base.link, &dev->vblank_event_list); } EXPORT_SYMBOL(drm_crtc_arm_vblank_event); /** * drm_crtc_send_vblank_event - helper to send vblank event after pageflip * @crtc: the source CRTC of the vblank event * @e: the event to send * * Updates sequence # and timestamp on event for the most recently processed * vblank, and sends it to userspace. Caller must hold event lock. * * See drm_crtc_arm_vblank_event() for a helper which can be used in certain * situation, especially to send out events for atomic commit operations. */ void drm_crtc_send_vblank_event(struct drm_crtc *crtc, struct drm_pending_vblank_event *e) { struct drm_device *dev = crtc->dev; u64 seq; unsigned int pipe = drm_crtc_index(crtc); ktime_t now; if (drm_dev_has_vblank(dev)) { seq = drm_vblank_count_and_time(dev, pipe, &now); } else { seq = 0; now = ktime_get(); } e->pipe = pipe; send_vblank_event(dev, e, seq, now); } EXPORT_SYMBOL(drm_crtc_send_vblank_event); static int __enable_vblank(struct drm_device *dev, unsigned int pipe) { if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (drm_WARN_ON(dev, !crtc)) return 0; if (crtc->funcs->enable_vblank) return crtc->funcs->enable_vblank(crtc); } return -EINVAL; } static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); int ret = 0; assert_spin_locked(&dev->vbl_lock); spin_lock(&dev->vblank_time_lock); if (!vblank->enabled) { /* * Enable vblank irqs under vblank_time_lock protection. * All vblank count & timestamp updates are held off * until we are done reinitializing master counter and * timestamps. Filtercode in drm_handle_vblank() will * prevent double-accounting of same vblank interval. */ ret = __enable_vblank(dev, pipe); drm_dbg_core(dev, "enabling vblank on crtc %u, ret: %d\n", pipe, ret); if (ret) { atomic_dec(&vblank->refcount); } else { drm_update_vblank_count(dev, pipe, 0); /* drm_update_vblank_count() includes a wmb so we just * need to ensure that the compiler emits the write * to mark the vblank as enabled after the call * to drm_update_vblank_count(). */ WRITE_ONCE(vblank->enabled, true); } } spin_unlock(&dev->vblank_time_lock); return ret; } int drm_vblank_get(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); unsigned long irqflags; int ret = 0; if (!drm_dev_has_vblank(dev)) return -EINVAL; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return -EINVAL; spin_lock_irqsave(&dev->vbl_lock, irqflags); /* Going from 0->1 means we have to enable interrupts again */ if (atomic_add_return(1, &vblank->refcount) == 1) { ret = drm_vblank_enable(dev, pipe); } else { if (!vblank->enabled) { atomic_dec(&vblank->refcount); ret = -EINVAL; } } spin_unlock_irqrestore(&dev->vbl_lock, irqflags); return ret; } /** * drm_crtc_vblank_get - get a reference count on vblank events * @crtc: which CRTC to own * * Acquire a reference count on vblank events to avoid having them disabled * while in use. * * Returns: * Zero on success or a negative error code on failure. */ int drm_crtc_vblank_get(struct drm_crtc *crtc) { return drm_vblank_get(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_get); void drm_vblank_put(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); int vblank_offdelay = vblank->config.offdelay_ms; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; if (drm_WARN_ON(dev, atomic_read(&vblank->refcount) == 0)) return; /* Last user schedules interrupt disable */ if (atomic_dec_and_test(&vblank->refcount)) { if (!vblank_offdelay) return; else if (vblank_offdelay < 0) vblank_disable_fn(&vblank->disable_timer); else if (!vblank->config.disable_immediate) mod_timer(&vblank->disable_timer, jiffies + ((vblank_offdelay * HZ) / 1000)); } } /** * drm_crtc_vblank_put - give up ownership of vblank events * @crtc: which counter to give up * * Release ownership of a given vblank counter, turning off interrupts * if possible. Disable interrupts after &drm_vblank_crtc_config.offdelay_ms * milliseconds. */ void drm_crtc_vblank_put(struct drm_crtc *crtc) { drm_vblank_put(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_vblank_put); /** * drm_wait_one_vblank - wait for one vblank * @dev: DRM device * @pipe: CRTC index * * This waits for one vblank to pass on @pipe, using the irq driver interfaces. * It is a failure to call this when the vblank irq for @pipe is disabled, e.g. * due to lack of driver support or because the crtc is off. * * This is the legacy version of drm_crtc_wait_one_vblank(). */ void drm_wait_one_vblank(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); int ret; u64 last; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; ret = drm_vblank_get(dev, pipe); if (drm_WARN(dev, ret, "vblank not available on crtc %i, ret=%i\n", pipe, ret)) return; last = drm_vblank_count(dev, pipe); ret = wait_event_timeout(vblank->queue, last != drm_vblank_count(dev, pipe), msecs_to_jiffies(1000)); drm_WARN(dev, ret == 0, "vblank wait timed out on crtc %i\n", pipe); drm_vblank_put(dev, pipe); } EXPORT_SYMBOL(drm_wait_one_vblank); /** * drm_crtc_wait_one_vblank - wait for one vblank * @crtc: DRM crtc * * This waits for one vblank to pass on @crtc, using the irq driver interfaces. * It is a failure to call this when the vblank irq for @crtc is disabled, e.g. * due to lack of driver support or because the crtc is off. */ void drm_crtc_wait_one_vblank(struct drm_crtc *crtc) { drm_wait_one_vblank(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_wait_one_vblank); /** * drm_crtc_vblank_off - disable vblank events on a CRTC * @crtc: CRTC in question * * Drivers can use this function to shut down the vblank interrupt handling when * disabling a crtc. This function ensures that the latest vblank frame count is * stored so that drm_vblank_on can restore it again. * * Drivers must use this function when the hardware vblank counter can get * reset, e.g. when suspending or disabling the @crtc in general. */ void drm_crtc_vblank_off(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); struct drm_pending_vblank_event *e, *t; ktime_t now; u64 seq; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; /* * Grab event_lock early to prevent vblank work from being scheduled * while we're in the middle of shutting down vblank interrupts */ spin_lock_irq(&dev->event_lock); spin_lock(&dev->vbl_lock); drm_dbg_vbl(dev, "crtc %d, vblank enabled %d, inmodeset %d\n", pipe, vblank->enabled, vblank->inmodeset); /* Avoid redundant vblank disables without previous * drm_crtc_vblank_on(). */ if (drm_core_check_feature(dev, DRIVER_ATOMIC) || !vblank->inmodeset) drm_vblank_disable_and_save(dev, pipe); wake_up(&vblank->queue); /* * Prevent subsequent drm_vblank_get() from re-enabling * the vblank interrupt by bumping the refcount. */ if (!vblank->inmodeset) { atomic_inc(&vblank->refcount); vblank->inmodeset = 1; } spin_unlock(&dev->vbl_lock); /* Send any queued vblank events, lest the natives grow disquiet */ seq = drm_vblank_count_and_time(dev, pipe, &now); list_for_each_entry_safe(e, t, &dev->vblank_event_list, base.link) { if (e->pipe != pipe) continue; drm_dbg_core(dev, "Sending premature vblank event on disable: " "wanted %llu, current %llu\n", e->sequence, seq); list_del(&e->base.link); drm_vblank_put(dev, pipe); send_vblank_event(dev, e, seq, now); } /* Cancel any leftover pending vblank work */ drm_vblank_cancel_pending_works(vblank); spin_unlock_irq(&dev->event_lock); /* Will be reset by the modeset helpers when re-enabling the crtc by * calling drm_calc_timestamping_constants(). */ vblank->hwmode.crtc_clock = 0; /* Wait for any vblank work that's still executing to finish */ drm_vblank_flush_worker(vblank); } EXPORT_SYMBOL(drm_crtc_vblank_off); /** * drm_crtc_vblank_reset - reset vblank state to off on a CRTC * @crtc: CRTC in question * * Drivers can use this function to reset the vblank state to off at load time. * Drivers should use this together with the drm_crtc_vblank_off() and * drm_crtc_vblank_on() functions. The difference compared to * drm_crtc_vblank_off() is that this function doesn't save the vblank counter * and hence doesn't need to call any driver hooks. * * This is useful for recovering driver state e.g. on driver load, or on resume. */ void drm_crtc_vblank_reset(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); spin_lock_irq(&dev->vbl_lock); /* * Prevent subsequent drm_vblank_get() from enabling the vblank * interrupt by bumping the refcount. */ if (!vblank->inmodeset) { atomic_inc(&vblank->refcount); vblank->inmodeset = 1; } spin_unlock_irq(&dev->vbl_lock); drm_WARN_ON(dev, !list_empty(&dev->vblank_event_list)); drm_WARN_ON(dev, !list_empty(&vblank->pending_work)); } EXPORT_SYMBOL(drm_crtc_vblank_reset); /** * drm_crtc_set_max_vblank_count - configure the hw max vblank counter value * @crtc: CRTC in question * @max_vblank_count: max hardware vblank counter value * * Update the maximum hardware vblank counter value for @crtc * at runtime. Useful for hardware where the operation of the * hardware vblank counter depends on the currently active * display configuration. * * For example, if the hardware vblank counter does not work * when a specific connector is active the maximum can be set * to zero. And when that specific connector isn't active the * maximum can again be set to the appropriate non-zero value. * * If used, must be called before drm_vblank_on(). */ void drm_crtc_set_max_vblank_count(struct drm_crtc *crtc, u32 max_vblank_count) { struct drm_device *dev = crtc->dev; struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); drm_WARN_ON(dev, dev->max_vblank_count); drm_WARN_ON(dev, !READ_ONCE(vblank->inmodeset)); vblank->max_vblank_count = max_vblank_count; } EXPORT_SYMBOL(drm_crtc_set_max_vblank_count); /** * drm_crtc_vblank_on_config - enable vblank events on a CRTC with custom * configuration options * @crtc: CRTC in question * @config: Vblank configuration value * * See drm_crtc_vblank_on(). In addition, this function allows you to provide a * custom vblank configuration for a given CRTC. * * Note that @config is copied, the pointer does not need to stay valid beyond * this function call. For details of the parameters see * struct drm_vblank_crtc_config. */ void drm_crtc_vblank_on_config(struct drm_crtc *crtc, const struct drm_vblank_crtc_config *config) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; spin_lock_irq(&dev->vbl_lock); drm_dbg_vbl(dev, "crtc %d, vblank enabled %d, inmodeset %d\n", pipe, vblank->enabled, vblank->inmodeset); vblank->config = *config; /* Drop our private "prevent drm_vblank_get" refcount */ if (vblank->inmodeset) { atomic_dec(&vblank->refcount); vblank->inmodeset = 0; } drm_reset_vblank_timestamp(dev, pipe); /* * re-enable interrupts if there are users left, or the * user wishes vblank interrupts to be enabled all the time. */ if (atomic_read(&vblank->refcount) != 0 || !vblank->config.offdelay_ms) drm_WARN_ON(dev, drm_vblank_enable(dev, pipe)); spin_unlock_irq(&dev->vbl_lock); } EXPORT_SYMBOL(drm_crtc_vblank_on_config); /** * drm_crtc_vblank_on - enable vblank events on a CRTC * @crtc: CRTC in question * * This functions restores the vblank interrupt state captured with * drm_crtc_vblank_off() again and is generally called when enabling @crtc. Note * that calls to drm_crtc_vblank_on() and drm_crtc_vblank_off() can be * unbalanced and so can also be unconditionally called in driver load code to * reflect the current hardware state of the crtc. * * Note that unlike in drm_crtc_vblank_on_config(), default values are used. */ void drm_crtc_vblank_on(struct drm_crtc *crtc) { const struct drm_vblank_crtc_config config = { .offdelay_ms = drm_vblank_offdelay, .disable_immediate = crtc->dev->vblank_disable_immediate }; drm_crtc_vblank_on_config(crtc, &config); } EXPORT_SYMBOL(drm_crtc_vblank_on); static void drm_vblank_restore(struct drm_device *dev, unsigned int pipe) { ktime_t t_vblank; struct drm_vblank_crtc *vblank; int framedur_ns; u64 diff_ns; u32 cur_vblank, diff = 1; int count = DRM_TIMESTAMP_MAXRETRIES; u32 max_vblank_count = drm_max_vblank_count(dev, pipe); if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return; assert_spin_locked(&dev->vbl_lock); assert_spin_locked(&dev->vblank_time_lock); vblank = drm_vblank_crtc(dev, pipe); drm_WARN_ONCE(dev, drm_debug_enabled(DRM_UT_VBL) && !vblank->framedur_ns, "Cannot compute missed vblanks without frame duration\n"); framedur_ns = vblank->framedur_ns; do { cur_vblank = __get_vblank_counter(dev, pipe); drm_get_last_vbltimestamp(dev, pipe, &t_vblank, false); } while (cur_vblank != __get_vblank_counter(dev, pipe) && --count > 0); diff_ns = ktime_to_ns(ktime_sub(t_vblank, vblank->time)); if (framedur_ns) diff = DIV_ROUND_CLOSEST_ULL(diff_ns, framedur_ns); drm_dbg_vbl(dev, "missed %d vblanks in %lld ns, frame duration=%d ns, hw_diff=%d\n", diff, diff_ns, framedur_ns, cur_vblank - vblank->last); vblank->last = (cur_vblank - diff) & max_vblank_count; } /** * drm_crtc_vblank_restore - estimate missed vblanks and update vblank count. * @crtc: CRTC in question * * Power manamement features can cause frame counter resets between vblank * disable and enable. Drivers can use this function in their * &drm_crtc_funcs.enable_vblank implementation to estimate missed vblanks since * the last &drm_crtc_funcs.disable_vblank using timestamps and update the * vblank counter. * * Note that drivers must have race-free high-precision timestamping support, * i.e. &drm_crtc_funcs.get_vblank_timestamp must be hooked up and * &drm_vblank_crtc_config.disable_immediate must be set to indicate the * time-stamping functions are race-free against vblank hardware counter * increments. */ void drm_crtc_vblank_restore(struct drm_crtc *crtc) { struct drm_device *dev = crtc->dev; unsigned int pipe = drm_crtc_index(crtc); struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); drm_WARN_ON_ONCE(dev, !crtc->funcs->get_vblank_timestamp); drm_WARN_ON_ONCE(dev, vblank->inmodeset); drm_WARN_ON_ONCE(dev, !vblank->config.disable_immediate); drm_vblank_restore(dev, pipe); } EXPORT_SYMBOL(drm_crtc_vblank_restore); static int drm_queue_vblank_event(struct drm_device *dev, unsigned int pipe, u64 req_seq, union drm_wait_vblank *vblwait, struct drm_file *file_priv) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); struct drm_pending_vblank_event *e; ktime_t now; u64 seq; int ret; e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) { ret = -ENOMEM; goto err_put; } e->pipe = pipe; e->event.base.type = DRM_EVENT_VBLANK; e->event.base.length = sizeof(e->event.vbl); e->event.vbl.user_data = vblwait->request.signal; e->event.vbl.crtc_id = 0; if (drm_core_check_feature(dev, DRIVER_MODESET)) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); if (crtc) e->event.vbl.crtc_id = crtc->base.id; } spin_lock_irq(&dev->event_lock); /* * drm_crtc_vblank_off() might have been called after we called * drm_vblank_get(). drm_crtc_vblank_off() holds event_lock around the * vblank disable, so no need for further locking. The reference from * drm_vblank_get() protects against vblank disable from another source. */ if (!READ_ONCE(vblank->enabled)) { ret = -EINVAL; goto err_unlock; } ret = drm_event_reserve_init_locked(dev, file_priv, &e->base, &e->event.base); if (ret) goto err_unlock; seq = drm_vblank_count_and_time(dev, pipe, &now); drm_dbg_core(dev, "event on vblank count %llu, current %llu, crtc %u\n", req_seq, seq, pipe); trace_drm_vblank_event_queued(file_priv, pipe, req_seq); e->sequence = req_seq; if (drm_vblank_passed(seq, req_seq)) { drm_vblank_put(dev, pipe); send_vblank_event(dev, e, seq, now); vblwait->reply.sequence = seq; } else { /* drm_handle_vblank_events will call drm_vblank_put */ list_add_tail(&e->base.link, &dev->vblank_event_list); vblwait->reply.sequence = req_seq; } spin_unlock_irq(&dev->event_lock); return 0; err_unlock: spin_unlock_irq(&dev->event_lock); kfree(e); err_put: drm_vblank_put(dev, pipe); return ret; } static bool drm_wait_vblank_is_query(union drm_wait_vblank *vblwait) { if (vblwait->request.sequence) return false; return _DRM_VBLANK_RELATIVE == (vblwait->request.type & (_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_EVENT | _DRM_VBLANK_NEXTONMISS)); } /* * Widen a 32-bit param to 64-bits. * * \param narrow 32-bit value (missing upper 32 bits) * \param near 64-bit value that should be 'close' to near * * This function returns a 64-bit value using the lower 32-bits from * 'narrow' and constructing the upper 32-bits so that the result is * as close as possible to 'near'. */ static u64 widen_32_to_64(u32 narrow, u64 near) { return near + (s32) (narrow - near); } static void drm_wait_vblank_reply(struct drm_device *dev, unsigned int pipe, struct drm_wait_vblank_reply *reply) { ktime_t now; struct timespec64 ts; /* * drm_wait_vblank_reply is a UAPI structure that uses 'long' * to store the seconds. This is safe as we always use monotonic * timestamps since linux-4.15. */ reply->sequence = drm_vblank_count_and_time(dev, pipe, &now); ts = ktime_to_timespec64(now); reply->tval_sec = (u32)ts.tv_sec; reply->tval_usec = ts.tv_nsec / 1000; } static bool drm_wait_vblank_supported(struct drm_device *dev) { return drm_dev_has_vblank(dev); } int drm_wait_vblank_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_crtc *crtc; struct drm_vblank_crtc *vblank; union drm_wait_vblank *vblwait = data; int ret; u64 req_seq, seq; unsigned int pipe_index; unsigned int flags, pipe, high_pipe; if (!drm_wait_vblank_supported(dev)) return -EOPNOTSUPP; if (vblwait->request.type & _DRM_VBLANK_SIGNAL) return -EINVAL; if (vblwait->request.type & ~(_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK | _DRM_VBLANK_HIGH_CRTC_MASK)) { drm_dbg_core(dev, "Unsupported type value 0x%x, supported mask 0x%x\n", vblwait->request.type, (_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK | _DRM_VBLANK_HIGH_CRTC_MASK)); return -EINVAL; } flags = vblwait->request.type & _DRM_VBLANK_FLAGS_MASK; high_pipe = (vblwait->request.type & _DRM_VBLANK_HIGH_CRTC_MASK); if (high_pipe) pipe_index = high_pipe >> _DRM_VBLANK_HIGH_CRTC_SHIFT; else pipe_index = flags & _DRM_VBLANK_SECONDARY ? 1 : 0; /* Convert lease-relative crtc index into global crtc index */ if (drm_core_check_feature(dev, DRIVER_MODESET)) { pipe = 0; drm_for_each_crtc(crtc, dev) { if (drm_lease_held(file_priv, crtc->base.id)) { if (pipe_index == 0) break; pipe_index--; } pipe++; } } else { pipe = pipe_index; } if (pipe >= dev->num_crtcs) return -EINVAL; vblank = &dev->vblank[pipe]; /* If the counter is currently enabled and accurate, short-circuit * queries to return the cached timestamp of the last vblank. */ if (vblank->config.disable_immediate && drm_wait_vblank_is_query(vblwait) && READ_ONCE(vblank->enabled)) { drm_wait_vblank_reply(dev, pipe, &vblwait->reply); return 0; } ret = drm_vblank_get(dev, pipe); if (ret) { drm_dbg_core(dev, "crtc %d failed to acquire vblank counter, %d\n", pipe, ret); return ret; } seq = drm_vblank_count(dev, pipe); switch (vblwait->request.type & _DRM_VBLANK_TYPES_MASK) { case _DRM_VBLANK_RELATIVE: req_seq = seq + vblwait->request.sequence; vblwait->request.sequence = req_seq; vblwait->request.type &= ~_DRM_VBLANK_RELATIVE; break; case _DRM_VBLANK_ABSOLUTE: req_seq = widen_32_to_64(vblwait->request.sequence, seq); break; default: ret = -EINVAL; goto done; } if ((flags & _DRM_VBLANK_NEXTONMISS) && drm_vblank_passed(seq, req_seq)) { req_seq = seq + 1; vblwait->request.type &= ~_DRM_VBLANK_NEXTONMISS; vblwait->request.sequence = req_seq; } if (flags & _DRM_VBLANK_EVENT) { /* must hold on to the vblank ref until the event fires * drm_vblank_put will be called asynchronously */ return drm_queue_vblank_event(dev, pipe, req_seq, vblwait, file_priv); } if (req_seq != seq) { int wait; drm_dbg_core(dev, "waiting on vblank count %llu, crtc %u\n", req_seq, pipe); wait = wait_event_interruptible_timeout(vblank->queue, drm_vblank_passed(drm_vblank_count(dev, pipe), req_seq) || !READ_ONCE(vblank->enabled), msecs_to_jiffies(3000)); switch (wait) { case 0: /* timeout */ ret = -EBUSY; break; case -ERESTARTSYS: /* interrupted by signal */ ret = -EINTR; break; default: ret = 0; break; } } if (ret != -EINTR) { drm_wait_vblank_reply(dev, pipe, &vblwait->reply); drm_dbg_core(dev, "crtc %d returning %u to client\n", pipe, vblwait->reply.sequence); } else { drm_dbg_core(dev, "crtc %d vblank wait interrupted by signal\n", pipe); } done: drm_vblank_put(dev, pipe); return ret; } static void drm_handle_vblank_events(struct drm_device *dev, unsigned int pipe) { struct drm_crtc *crtc = drm_crtc_from_index(dev, pipe); bool high_prec = false; struct drm_pending_vblank_event *e, *t; ktime_t now; u64 seq; assert_spin_locked(&dev->event_lock); seq = drm_vblank_count_and_time(dev, pipe, &now); list_for_each_entry_safe(e, t, &dev->vblank_event_list, base.link) { if (e->pipe != pipe) continue; if (!drm_vblank_passed(seq, e->sequence)) continue; drm_dbg_core(dev, "vblank event on %llu, current %llu\n", e->sequence, seq); list_del(&e->base.link); drm_vblank_put(dev, pipe); send_vblank_event(dev, e, seq, now); } if (crtc && crtc->funcs->get_vblank_timestamp) high_prec = true; trace_drm_vblank_event(pipe, seq, now, high_prec); } /** * drm_handle_vblank - handle a vblank event * @dev: DRM device * @pipe: index of CRTC where this event occurred * * Drivers should call this routine in their vblank interrupt handlers to * update the vblank counter and send any signals that may be pending. * * This is the legacy version of drm_crtc_handle_vblank(). */ bool drm_handle_vblank(struct drm_device *dev, unsigned int pipe) { struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe); unsigned long irqflags; bool disable_irq; if (drm_WARN_ON_ONCE(dev, !drm_dev_has_vblank(dev))) return false; if (drm_WARN_ON(dev, pipe >= dev->num_crtcs)) return false; spin_lock_irqsave(&dev->event_lock, irqflags); /* Need timestamp lock to prevent concurrent execution with * vblank enable/disable, as this would cause inconsistent * or corrupted timestamps and vblank counts. */ spin_lock(&dev->vblank_time_lock); /* Vblank irq handling disabled. Nothing to do. */ if (!vblank->enabled) { spin_unlock(&dev->vblank_time_lock); spin_unlock_irqrestore(&dev->event_lock, irqflags); return false; } drm_update_vblank_count(dev, pipe, true); spin_unlock(&dev->vblank_time_lock); wake_up(&vblank->queue); /* With instant-off, we defer disabling the interrupt until after * we finish processing the following vblank after all events have * been signaled. The disable has to be last (after * drm_handle_vblank_events) so that the timestamp is always accurate. */ disable_irq = (vblank->config.disable_immediate && vblank->config.offdelay_ms > 0 && !atomic_read(&vblank->refcount)); drm_handle_vblank_events(dev, pipe); drm_handle_vblank_works(vblank); spin_unlock_irqrestore(&dev->event_lock, irqflags); if (disable_irq) vblank_disable_fn(&vblank->disable_timer); return true; } EXPORT_SYMBOL(drm_handle_vblank); /** * drm_crtc_handle_vblank - handle a vblank event * @crtc: where this event occurred * * Drivers should call this routine in their vblank interrupt handlers to * update the vblank counter and send any signals that may be pending. * * This is the native KMS version of drm_handle_vblank(). * * Note that for a given vblank counter value drm_crtc_handle_vblank() * and drm_crtc_vblank_count() or drm_crtc_vblank_count_and_time() * provide a barrier: Any writes done before calling * drm_crtc_handle_vblank() will be visible to callers of the later * functions, if the vblank count is the same or a later one. * * See also &drm_vblank_crtc.count. * * Returns: * True if the event was successfully handled, false on failure. */ bool drm_crtc_handle_vblank(struct drm_crtc *crtc) { return drm_handle_vblank(crtc->dev, drm_crtc_index(crtc)); } EXPORT_SYMBOL(drm_crtc_handle_vblank); /* * Get crtc VBLANK count. * * \param dev DRM device * \param data user argument, pointing to a drm_crtc_get_sequence structure. * \param file_priv drm file private for the user's open file descriptor */ int drm_crtc_get_sequence_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_crtc *crtc; struct drm_vblank_crtc *vblank; int pipe; struct drm_crtc_get_sequence *get_seq = data; ktime_t now; bool vblank_enabled; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; if (!drm_dev_has_vblank(dev)) return -EOPNOTSUPP; crtc = drm_crtc_find(dev, file_priv, get_seq->crtc_id); if (!crtc) return -ENOENT; pipe = drm_crtc_index(crtc); vblank = drm_crtc_vblank_crtc(crtc); vblank_enabled = READ_ONCE(vblank->config.disable_immediate) && READ_ONCE(vblank->enabled); if (!vblank_enabled) { ret = drm_crtc_vblank_get(crtc); if (ret) { drm_dbg_core(dev, "crtc %d failed to acquire vblank counter, %d\n", pipe, ret); return ret; } } drm_modeset_lock(&crtc->mutex, NULL); if (crtc->state) get_seq->active = crtc->state->enable; else get_seq->active = crtc->enabled; drm_modeset_unlock(&crtc->mutex); get_seq->sequence = drm_vblank_count_and_time(dev, pipe, &now); get_seq->sequence_ns = ktime_to_ns(now); if (!vblank_enabled) drm_crtc_vblank_put(crtc); return 0; } /* * Queue a event for VBLANK sequence * * \param dev DRM device * \param data user argument, pointing to a drm_crtc_queue_sequence structure. * \param file_priv drm file private for the user's open file descriptor */ int drm_crtc_queue_sequence_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_crtc *crtc; struct drm_vblank_crtc *vblank; int pipe; struct drm_crtc_queue_sequence *queue_seq = data; ktime_t now; struct drm_pending_vblank_event *e; u32 flags; u64 seq; u64 req_seq; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; if (!drm_dev_has_vblank(dev)) return -EOPNOTSUPP; crtc = drm_crtc_find(dev, file_priv, queue_seq->crtc_id); if (!crtc) return -ENOENT; flags = queue_seq->flags; /* Check valid flag bits */ if (flags & ~(DRM_CRTC_SEQUENCE_RELATIVE| DRM_CRTC_SEQUENCE_NEXT_ON_MISS)) return -EINVAL; pipe = drm_crtc_index(crtc); vblank = drm_crtc_vblank_crtc(crtc); e = kzalloc(sizeof(*e), GFP_KERNEL); if (e == NULL) return -ENOMEM; ret = drm_crtc_vblank_get(crtc); if (ret) { drm_dbg_core(dev, "crtc %d failed to acquire vblank counter, %d\n", pipe, ret); goto err_free; } seq = drm_vblank_count_and_time(dev, pipe, &now); req_seq = queue_seq->sequence; if (flags & DRM_CRTC_SEQUENCE_RELATIVE) req_seq += seq; if ((flags & DRM_CRTC_SEQUENCE_NEXT_ON_MISS) && drm_vblank_passed(seq, req_seq)) req_seq = seq + 1; e->pipe = pipe; e->event.base.type = DRM_EVENT_CRTC_SEQUENCE; e->event.base.length = sizeof(e->event.seq); e->event.seq.user_data = queue_seq->user_data; spin_lock_irq(&dev->event_lock); /* * drm_crtc_vblank_off() might have been called after we called * drm_crtc_vblank_get(). drm_crtc_vblank_off() holds event_lock around the * vblank disable, so no need for further locking. The reference from * drm_crtc_vblank_get() protects against vblank disable from another source. */ if (!READ_ONCE(vblank->enabled)) { ret = -EINVAL; goto err_unlock; } ret = drm_event_reserve_init_locked(dev, file_priv, &e->base, &e->event.base); if (ret) goto err_unlock; e->sequence = req_seq; if (drm_vblank_passed(seq, req_seq)) { drm_crtc_vblank_put(crtc); send_vblank_event(dev, e, seq, now); queue_seq->sequence = seq; } else { /* drm_handle_vblank_events will call drm_vblank_put */ list_add_tail(&e->base.link, &dev->vblank_event_list); queue_seq->sequence = req_seq; } spin_unlock_irq(&dev->event_lock); return 0; err_unlock: spin_unlock_irq(&dev->event_lock); drm_crtc_vblank_put(crtc); err_free: kfree(e); return ret; } /* * VBLANK timer */ static enum hrtimer_restart drm_vblank_timer_function(struct hrtimer *timer) { struct drm_vblank_crtc_timer *vtimer = container_of(timer, struct drm_vblank_crtc_timer, timer); struct drm_crtc *crtc = vtimer->crtc; const struct drm_crtc_helper_funcs *crtc_funcs = crtc->helper_private; struct drm_device *dev = crtc->dev; unsigned long flags; ktime_t interval; u64 ret_overrun; bool succ; spin_lock_irqsave(&vtimer->interval_lock, flags); interval = vtimer->interval; spin_unlock_irqrestore(&vtimer->interval_lock, flags); if (!interval) return HRTIMER_NORESTART; ret_overrun = hrtimer_forward_now(&vtimer->timer, interval); if (ret_overrun != 1) drm_dbg_vbl(dev, "vblank timer overrun\n"); if (crtc_funcs->handle_vblank_timeout) succ = crtc_funcs->handle_vblank_timeout(crtc); else succ = drm_crtc_handle_vblank(crtc); if (!succ) return HRTIMER_NORESTART; return HRTIMER_RESTART; } /** * drm_crtc_vblank_start_timer - Starts the vblank timer on the given CRTC * @crtc: the CRTC * * Drivers should call this function from their CRTC's enable_vblank * function to start a vblank timer. The timer will fire after the duration * of a full frame. drm_crtc_vblank_cancel_timer() disables a running timer. * * Returns: * 0 on success, or a negative errno code otherwise. */ int drm_crtc_vblank_start_timer(struct drm_crtc *crtc) { struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); struct drm_vblank_crtc_timer *vtimer = &vblank->vblank_timer; unsigned long flags; if (!vtimer->crtc) { /* * Set up the data structures on the first invocation. */ vtimer->crtc = crtc; spin_lock_init(&vtimer->interval_lock); hrtimer_setup(&vtimer->timer, drm_vblank_timer_function, CLOCK_MONOTONIC, HRTIMER_MODE_REL); } else { /* * Timer should not be active. If it is, wait for the * previous cancel operations to finish. */ while (hrtimer_active(&vtimer->timer)) hrtimer_try_to_cancel(&vtimer->timer); } drm_calc_timestamping_constants(crtc, &crtc->mode); spin_lock_irqsave(&vtimer->interval_lock, flags); vtimer->interval = ns_to_ktime(vblank->framedur_ns); spin_unlock_irqrestore(&vtimer->interval_lock, flags); hrtimer_start(&vtimer->timer, vtimer->interval, HRTIMER_MODE_REL); return 0; } EXPORT_SYMBOL(drm_crtc_vblank_start_timer); /** * drm_crtc_vblank_cancel_timer - Cancels the given CRTC's vblank timer * @crtc: the CRTC * * Drivers should call this function from their CRTC's disable_vblank * function to stop a vblank timer. */ void drm_crtc_vblank_cancel_timer(struct drm_crtc *crtc) { struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); struct drm_vblank_crtc_timer *vtimer = &vblank->vblank_timer; unsigned long flags; /* * Calling hrtimer_cancel() can result in a deadlock with DRM's * vblank_time_lime_lock and hrtimers' softirq_expiry_lock. So * clear interval and indicate cancellation. The timer function * will cancel itself on the next invocation. */ spin_lock_irqsave(&vtimer->interval_lock, flags); vtimer->interval = 0; spin_unlock_irqrestore(&vtimer->interval_lock, flags); hrtimer_try_to_cancel(&vtimer->timer); } EXPORT_SYMBOL(drm_crtc_vblank_cancel_timer); /** * drm_crtc_vblank_get_vblank_timeout - Returns the vblank timeout * @crtc: The CRTC * @vblank_time: Returns the next vblank timestamp * * The helper drm_crtc_vblank_get_vblank_timeout() returns the next vblank * timestamp of the CRTC's vblank timer according to the timer's expiry * time. */ void drm_crtc_vblank_get_vblank_timeout(struct drm_crtc *crtc, ktime_t *vblank_time) { struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc); struct drm_vblank_crtc_timer *vtimer = &vblank->vblank_timer; u64 cur_count; ktime_t cur_time; if (!READ_ONCE(vblank->enabled)) { *vblank_time = ktime_get(); return; } /* * A concurrent vblank timeout could update the expires field before * we compare it with the vblank time. Hence we'd compare the old * expiry time to the new vblank time; deducing the timer had already * expired. Reread until we get consistent values from both fields. */ do { cur_count = drm_crtc_vblank_count_and_time(crtc, &cur_time); *vblank_time = READ_ONCE(vtimer->timer.node.expires); } while (cur_count != drm_crtc_vblank_count_and_time(crtc, &cur_time)); if (drm_WARN_ON(crtc->dev, !ktime_compare(*vblank_time, cur_time))) return; /* Already expired */ /* * To prevent races we roll the hrtimer forward before we do any * interrupt processing - this is how real hw works (the interrupt * is only generated after all the vblank registers are updated) * and what the vblank core expects. Therefore we need to always * correct the timestamp by one frame. */ *vblank_time = ktime_sub(*vblank_time, vtimer->interval); } EXPORT_SYMBOL(drm_crtc_vblank_get_vblank_timeout);
5544 5543 5494 5543 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PAGE_EXT_H #define __LINUX_PAGE_EXT_H #include <linux/types.h> #include <linux/mmzone.h> #include <linux/stacktrace.h> struct pglist_data; #ifdef CONFIG_PAGE_EXTENSION /** * struct page_ext_operations - per page_ext client operations * @offset: Offset to the client's data within page_ext. Offset is returned to * the client by page_ext_init. * @size: The size of the client data within page_ext. * @need: Function that returns true if client requires page_ext. * @init: (optional) Called to initialize client once page_exts are allocated. * @need_shared_flags: True when client is using shared page_ext->flags * field. * * Each Page Extension client must define page_ext_operations in * page_ext_ops array. */ struct page_ext_operations { size_t offset; size_t size; bool (*need)(void); void (*init)(void); bool need_shared_flags; }; /* * The page_ext_flags users must set need_shared_flags to true. */ enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) PAGE_EXT_YOUNG, PAGE_EXT_IDLE, #endif }; /* * Page Extension can be considered as an extended mem_map. * A page_ext page is associated with every page descriptor. The * page_ext helps us add more information about the page. * All page_ext are allocated at boot or memory hotplug event, * then the page_ext for pfn always exists. */ struct page_ext { unsigned long flags; }; extern bool early_page_ext; extern unsigned long page_ext_size; extern void pgdat_page_ext_init(struct pglist_data *pgdat); static inline bool early_page_ext_enabled(void) { return early_page_ext; } #ifdef CONFIG_SPARSEMEM static inline void page_ext_init_flatmem(void) { } extern void page_ext_init(void); static inline void page_ext_init_flatmem_late(void) { } static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) { /* * page_ext is allocated per memory section. Once we cross a * memory section, we have to fetch the new pointer. */ return next_pfn % PAGES_PER_SECTION; } #else extern void page_ext_init_flatmem(void); extern void page_ext_init_flatmem_late(void); static inline void page_ext_init(void) { } static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn) { return true; } #endif extern struct page_ext *page_ext_get(const struct page *page); extern void page_ext_put(struct page_ext *page_ext); extern struct page_ext *page_ext_lookup(unsigned long pfn); static inline void *page_ext_data(struct page_ext *page_ext, struct page_ext_operations *ops) { return (void *)(page_ext) + ops->offset; } static inline struct page_ext *page_ext_next(struct page_ext *curr) { void *next = curr; next += page_ext_size; return next; } struct page_ext_iter { unsigned long index; unsigned long start_pfn; struct page_ext *page_ext; }; /** * page_ext_iter_begin() - Prepare for iterating through page extensions. * @iter: page extension iterator. * @pfn: PFN of the page we're interested in. * * Must be called with RCU read lock taken. * * Return: NULL if no page_ext exists for this page. */ static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter, unsigned long pfn) { iter->index = 0; iter->start_pfn = pfn; iter->page_ext = page_ext_lookup(pfn); return iter->page_ext; } /** * page_ext_iter_next() - Get next page extension * @iter: page extension iterator. * * Must be called with RCU read lock taken. * * Return: NULL if no next page_ext exists. */ static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter) { unsigned long pfn; if (WARN_ON_ONCE(!iter->page_ext)) return NULL; iter->index++; pfn = iter->start_pfn + iter->index; if (page_ext_iter_next_fast_possible(pfn)) iter->page_ext = page_ext_next(iter->page_ext); else iter->page_ext = page_ext_lookup(pfn); return iter->page_ext; } /** * page_ext_iter_get() - Get current page extension * @iter: page extension iterator. * * Return: NULL if no page_ext exists for this iterator. */ static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter) { return iter->page_ext; } /** * for_each_page_ext(): iterate through page_ext objects. * @__page: the page we're interested in * @__pgcount: how many pages to iterate through * @__page_ext: struct page_ext pointer where the current page_ext * object is returned * @__iter: struct page_ext_iter object (defined in the stack) * * IMPORTANT: must be called with RCU read lock taken. */ #define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \ for (__page_ext = page_ext_iter_begin(&__iter, page_to_pfn(__page));\ __page_ext && __iter.index < __pgcount; \ __page_ext = page_ext_iter_next(&__iter)) #else /* !CONFIG_PAGE_EXTENSION */ struct page_ext; static inline bool early_page_ext_enabled(void) { return false; } static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } static inline void page_ext_init(void) { } static inline void page_ext_init_flatmem_late(void) { } static inline void page_ext_init_flatmem(void) { } static inline struct page_ext *page_ext_get(const struct page *page) { return NULL; } static inline void page_ext_put(struct page_ext *page_ext) { } #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */
5 5 5 5 2 2 5 5 5 5 4 4 4 5 5 4 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 // SPDX-License-Identifier: GPL-2.0-only /* * In memory quota format relies on quota infrastructure to store dquot * information for us. While conventional quota formats for file systems * with persistent storage can load quota information into dquot from the * storage on-demand and hence quota dquot shrinker can free any dquot * that is not currently being used, it must be avoided here. Otherwise we * can lose valuable information, user provided limits, because there is * no persistent storage to load the information from afterwards. * * One information that in-memory quota format needs to keep track of is * a sorted list of ids for each quota type. This is done by utilizing * an rb tree which root is stored in mem_dqinfo->dqi_priv for each quota * type. * * This format can be used to support quota on file system without persistent * storage such as tmpfs. * * Author: Lukas Czerner <lczerner@redhat.com> * Carlos Maiolino <cmaiolino@redhat.com> * * Copyright (C) 2023 Red Hat, Inc. */ #include <linux/errno.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/shmem_fs.h> #include <linux/quotaops.h> #include <linux/quota.h> /* * The following constants define the amount of time given a user * before the soft limits are treated as hard limits (usually resulting * in an allocation failure). The timer is started when the user crosses * their soft limit, it is reset when they go below their soft limit. */ #define SHMEM_MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ #define SHMEM_MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ struct quota_id { struct rb_node node; qid_t id; qsize_t bhardlimit; qsize_t bsoftlimit; qsize_t ihardlimit; qsize_t isoftlimit; }; static int shmem_check_quota_file(struct super_block *sb, int type) { /* There is no real quota file, nothing to do */ return 1; } /* * There is no real quota file. Just allocate rb_root for quota ids and * set limits */ static int shmem_read_file_info(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); struct mem_dqinfo *info = &dqopt->info[type]; info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS); if (!info->dqi_priv) return -ENOMEM; info->dqi_max_spc_limit = SHMEM_QUOTA_MAX_SPC_LIMIT; info->dqi_max_ino_limit = SHMEM_QUOTA_MAX_INO_LIMIT; info->dqi_bgrace = SHMEM_MAX_DQ_TIME; info->dqi_igrace = SHMEM_MAX_IQ_TIME; info->dqi_flags = 0; return 0; } static int shmem_write_file_info(struct super_block *sb, int type) { /* There is no real quota file, nothing to do */ return 0; } /* * Free all the quota_id entries in the rb tree and rb_root. */ static int shmem_free_file_info(struct super_block *sb, int type) { struct mem_dqinfo *info = &sb_dqopt(sb)->info[type]; struct rb_root *root = info->dqi_priv; struct quota_id *entry; struct rb_node *node; info->dqi_priv = NULL; node = rb_first(root); while (node) { entry = rb_entry(node, struct quota_id, node); node = rb_next(&entry->node); rb_erase(&entry->node, root); kfree(entry); } kfree(root); return 0; } static int shmem_get_next_id(struct super_block *sb, struct kqid *qid) { struct mem_dqinfo *info = sb_dqinfo(sb, qid->type); struct rb_node *node; qid_t id = from_kqid(&init_user_ns, *qid); struct quota_info *dqopt = sb_dqopt(sb); struct quota_id *entry = NULL; int ret = 0; if (!sb_has_quota_active(sb, qid->type)) return -ESRCH; down_read(&dqopt->dqio_sem); node = ((struct rb_root *)info->dqi_priv)->rb_node; while (node) { entry = rb_entry(node, struct quota_id, node); if (id < entry->id) node = node->rb_left; else if (id > entry->id) node = node->rb_right; else goto got_next_id; } if (!entry) { ret = -ENOENT; goto out_unlock; } if (id > entry->id) { node = rb_next(&entry->node); if (!node) { ret = -ENOENT; goto out_unlock; } entry = rb_entry(node, struct quota_id, node); } got_next_id: *qid = make_kqid(&init_user_ns, qid->type, entry->id); out_unlock: up_read(&dqopt->dqio_sem); return ret; } /* * Load dquot with limits from existing entry, or create the new entry if * it does not exist. */ static int shmem_acquire_dquot(struct dquot *dquot) { struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); struct rb_node **n; struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; struct rb_node *parent = NULL, *new_node = NULL; struct quota_id *new_entry, *entry; qid_t id = from_kqid(&init_user_ns, dquot->dq_id); struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); int ret = 0; mutex_lock(&dquot->dq_lock); down_write(&dqopt->dqio_sem); n = &((struct rb_root *)info->dqi_priv)->rb_node; while (*n) { parent = *n; entry = rb_entry(parent, struct quota_id, node); if (id < entry->id) n = &(*n)->rb_left; else if (id > entry->id) n = &(*n)->rb_right; else goto found; } /* We don't have entry for this id yet, create it */ new_entry = kzalloc(sizeof(struct quota_id), GFP_NOFS); if (!new_entry) { ret = -ENOMEM; goto out_unlock; } new_entry->id = id; if (dquot->dq_id.type == USRQUOTA) { new_entry->bhardlimit = sbinfo->qlimits.usrquota_bhardlimit; new_entry->ihardlimit = sbinfo->qlimits.usrquota_ihardlimit; } else if (dquot->dq_id.type == GRPQUOTA) { new_entry->bhardlimit = sbinfo->qlimits.grpquota_bhardlimit; new_entry->ihardlimit = sbinfo->qlimits.grpquota_ihardlimit; } new_node = &new_entry->node; rb_link_node(new_node, parent, n); rb_insert_color(new_node, (struct rb_root *)info->dqi_priv); entry = new_entry; found: /* Load the stored limits from the tree */ spin_lock(&dquot->dq_dqb_lock); dquot->dq_dqb.dqb_bhardlimit = entry->bhardlimit; dquot->dq_dqb.dqb_bsoftlimit = entry->bsoftlimit; dquot->dq_dqb.dqb_ihardlimit = entry->ihardlimit; dquot->dq_dqb.dqb_isoftlimit = entry->isoftlimit; if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit && !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); /* Make sure flags update is visible after dquot has been filled */ smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_unlock: up_write(&dqopt->dqio_sem); mutex_unlock(&dquot->dq_lock); return ret; } static bool shmem_is_empty_dquot(struct dquot *dquot) { struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; qsize_t bhardlimit; qsize_t ihardlimit; if (dquot->dq_id.type == USRQUOTA) { bhardlimit = sbinfo->qlimits.usrquota_bhardlimit; ihardlimit = sbinfo->qlimits.usrquota_ihardlimit; } else if (dquot->dq_id.type == GRPQUOTA) { bhardlimit = sbinfo->qlimits.grpquota_bhardlimit; ihardlimit = sbinfo->qlimits.grpquota_ihardlimit; } if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || (dquot->dq_dqb.dqb_curspace == 0 && dquot->dq_dqb.dqb_curinodes == 0 && dquot->dq_dqb.dqb_bhardlimit == bhardlimit && dquot->dq_dqb.dqb_ihardlimit == ihardlimit)) return true; return false; } /* * Store limits from dquot in the tree unless it's fake. If it is fake * remove the id from the tree since there is no useful information in * there. */ static int shmem_release_dquot(struct dquot *dquot) { struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); struct rb_node *node; qid_t id = from_kqid(&init_user_ns, dquot->dq_id); struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); struct quota_id *entry = NULL; mutex_lock(&dquot->dq_lock); /* Check whether we are not racing with some other dqget() */ if (dquot_is_busy(dquot)) goto out_dqlock; down_write(&dqopt->dqio_sem); node = ((struct rb_root *)info->dqi_priv)->rb_node; while (node) { entry = rb_entry(node, struct quota_id, node); if (id < entry->id) node = node->rb_left; else if (id > entry->id) node = node->rb_right; else goto found; } /* We should always find the entry in the rb tree */ WARN_ONCE(1, "quota id %u from dquot %p, not in rb tree!\n", id, dquot); up_write(&dqopt->dqio_sem); mutex_unlock(&dquot->dq_lock); return -ENOENT; found: if (shmem_is_empty_dquot(dquot)) { /* Remove entry from the tree */ rb_erase(&entry->node, info->dqi_priv); kfree(entry); } else { /* Store the limits in the tree */ spin_lock(&dquot->dq_dqb_lock); entry->bhardlimit = dquot->dq_dqb.dqb_bhardlimit; entry->bsoftlimit = dquot->dq_dqb.dqb_bsoftlimit; entry->ihardlimit = dquot->dq_dqb.dqb_ihardlimit; entry->isoftlimit = dquot->dq_dqb.dqb_isoftlimit; spin_unlock(&dquot->dq_dqb_lock); } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); up_write(&dqopt->dqio_sem); out_dqlock: mutex_unlock(&dquot->dq_lock); return 0; } static int shmem_mark_dquot_dirty(struct dquot *dquot) { return 0; } static int shmem_dquot_write_info(struct super_block *sb, int type) { return 0; } static const struct quota_format_ops shmem_format_ops = { .check_quota_file = shmem_check_quota_file, .read_file_info = shmem_read_file_info, .write_file_info = shmem_write_file_info, .free_file_info = shmem_free_file_info, }; struct quota_format_type shmem_quota_format = { .qf_fmt_id = QFMT_SHMEM, .qf_ops = &shmem_format_ops, .qf_owner = THIS_MODULE }; const struct dquot_operations shmem_quota_operations = { .acquire_dquot = shmem_acquire_dquot, .release_dquot = shmem_release_dquot, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .write_info = shmem_dquot_write_info, .mark_dirty = shmem_mark_dquot_dirty, .get_next_id = shmem_get_next_id, };
363 6 6 322 317 313 213 318 315 320 141 141 141 142 140 6 6 6 6 6 211 324 210 33 3 3 3 3 327 3 325 326 208 209 327 326 324 326 132 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains. */ #define pr_fmt(fmt) "PM: " fmt #define dev_fmt pr_fmt #include <linux/device.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm-trace.h> #include <linux/pm_wakeirq.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/async.h> #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> #include <linux/nmi.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * * Since device_pm_add() may be called with a device lock held, * we must never try to acquire a device lock while holding * dpm_list_mutex. */ LIST_HEAD(dpm_list); static LIST_HEAD(dpm_prepared_list); static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; static DEFINE_MUTEX(async_wip_mtx); static int async_error; /** * pm_hibernate_is_recovering - if recovering from hibernate due to error. * * Used to query if dev_pm_ops.thaw() is called for normal hibernation case or * recovering from some error. * * Return: true for error case, false for normal case. */ bool pm_hibernate_is_recovering(void) { return pm_transition.event == PM_EVENT_RECOVER; } EXPORT_SYMBOL_GPL(pm_hibernate_is_recovering); static const char *pm_verb(int event) { switch (event) { case PM_EVENT_SUSPEND: return "suspend"; case PM_EVENT_RESUME: return "resume"; case PM_EVENT_FREEZE: return "freeze"; case PM_EVENT_QUIESCE: return "quiesce"; case PM_EVENT_HIBERNATE: return "hibernate"; case PM_EVENT_THAW: return "thaw"; case PM_EVENT_RESTORE: return "restore"; case PM_EVENT_RECOVER: return "recover"; case PM_EVENT_POWEROFF: return "poweroff"; default: return "(unknown PM event)"; } } /** * device_pm_sleep_init - Initialize system suspend-related device fields. * @dev: Device object being initialized. */ void device_pm_sleep_init(struct device *dev) { dev->power.is_prepared = false; dev->power.is_suspended = false; dev->power.is_noirq_suspended = false; dev->power.is_late_suspended = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; INIT_LIST_HEAD(&dev->power.entry); } /** * device_pm_lock - Lock the list of active devices used by the PM core. */ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core. */ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. */ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT && !device_link_flag_is_sync_state_only(link->flags)) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_POWEROFF: case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? "aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; bool fatal; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd static bool __read_mostly dpm_watchdog_all_cpu_backtrace; module_param(dpm_watchdog_all_cpu_backtrace, bool, 0644); MODULE_PARM_DESC(dpm_watchdog_all_cpu_backtrace, "Backtrace all CPUs on DPM watchdog timeout"); /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = timer_container_of(wd, t, timer); struct timer_list *timer = &wd->timer; unsigned int time_left; if (wd->fatal) { unsigned int this_cpu = smp_processor_id(); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); if (dpm_watchdog_all_cpu_backtrace) trigger_allbutcpu_cpu_backtrace(this_cpu); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } time_left = CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; dev_warn(wd->dev, "**** DPM device timeout after %u seconds; %u seconds until panic ****\n", CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT, time_left); show_stack(wd->tsk, NULL, KERN_WARNING); wd->fatal = true; mod_timer(timer, jiffies + HZ * time_left); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. */ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; wd->fatal = CONFIG_DPM_WATCHDOG_TIMEOUT == CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; timer_delete_sync(timer); timer_destroy_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). */ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool __dpm_async(struct device *dev, async_func_t func) { if (dev->power.work_in_progress) return true; if (!is_async(dev)) return false; dev->power.work_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); return false; } static bool dpm_async_fn(struct device *dev, async_func_t func) { guard(mutex)(&async_wip_mtx); return __dpm_async(dev, func); } static int dpm_async_with_cleanup(struct device *dev, void *fn) { guard(mutex)(&async_wip_mtx); if (!__dpm_async(dev, fn)) dev->power.work_in_progress = false; return 0; } static void dpm_async_resume_children(struct device *dev, async_func_t func) { /* * Prevent racing with dpm_clear_async_state() during initial list * walks in dpm_noirq_resume_devices(), dpm_resume_early(), and * dpm_resume(). */ guard(mutex)(&dpm_list_mtx); /* * Start processing "async" children of the device unless it's been * started already for them. */ device_for_each_child(dev, func, dpm_async_with_cleanup); } static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) { struct device_link *link; int idx; dpm_async_resume_children(dev, func); idx = device_links_read_lock(); /* Start processing the device's "async" consumers. */ dev_for_each_link_to_consumer(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->consumer, func); device_links_read_unlock(idx); } static void dpm_clear_async_state(struct device *dev) { reinit_completion(&dev->power.completion); dev->power.work_in_progress = false; } static bool dpm_root_device(struct device *dev) { lockdep_assert_held(&dpm_list_mtx); /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * consumers is actually empty before calling it. */ return !dev->parent && list_empty(&dev->links.suppliers); } static void async_resume_noirq(void *data, async_cookie_t cookie); /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) { /* * This means that system suspend has been aborted in the noirq * phase before invoking the noirq suspend callback for the * device, so if device_suspend_late() has left it in suspend, * device_resume_early() should leave it in suspend either in * case the early resume of it depends on the noirq resume that * has not run. */ if (dev_pm_skip_suspend(dev)) dev->power.must_resume = false; goto Out; } if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active" unless its power.smart_suspend flag is clear, in * which case it is not necessary to update its PM-runtime status. */ if (skip_resume) pm_runtime_set_suspended(dev); else if (dev_pm_smart_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } dpm_async_resume_subordinate(dev, async_resume_noirq); } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_noirq); } while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dpm_async_fn(dev, async_resume_noirq)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } static void async_resume_early(void *data, async_cookie_t cookie); /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (dev->power.syscore) goto Skip; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; pm_runtime_enable(dev); Out: TRACE_RESUME(error); complete_all(&dev->power.completion); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } dpm_async_resume_subordinate(dev, async_resume_early); } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume_early); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dpm_async_fn(dev, async_resume_early)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); static void async_resume(void *data, async_cookie_t cookie); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. */ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (!dev->power.is_suspended) goto Complete; dev->power.is_suspended = false; if (dev->power.direct_complete) { /* * Allow new children to be added under the device after this * point if it has no PM callbacks. */ if (dev->power.no_pm_callbacks) dev->power.is_prepared = false; /* Match the pm_runtime_disable() in device_suspend(). */ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } dpm_async_resume_subordinate(dev, async_resume); } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" root devices upfront so they don't wait for * the "sync" devices they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_root_device(dev)) dpm_async_with_cleanup(dev, async_resume); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dpm_async_fn(dev, async_resume)) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. */ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: /* If enabling runtime PM for the device is blocked, unblock it. */ pm_runtime_unblock(dev); pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); pm_restore_gfp_mask(); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ static bool dpm_leaf_device(struct device *dev) { struct device *child; lockdep_assert_held(&dpm_list_mtx); child = device_find_any_child(dev); if (child) { put_device(child); return false; } /* * Since this function is required to run under dpm_list_mtx, the * list_empty() below will only return true if the device's list of * consumers is actually empty before calling it. */ return list_empty(&dev->links.consumers); } static bool dpm_async_suspend_parent(struct device *dev, async_func_t func) { guard(mutex)(&dpm_list_mtx); /* * If the device is suspended asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by checking * if the device has been deleted already as the parent cannot be * deleted before it. */ if (!device_pm_initialized(dev)) return false; /* Start processing the device's parent if it is "async". */ if (dev->parent) dpm_async_with_cleanup(dev->parent, func); return true; } static void dpm_async_suspend_superior(struct device *dev, async_func_t func) { struct device_link *link; int idx; if (!dpm_async_suspend_parent(dev, func)) return; idx = device_links_read_lock(); /* Start processing the device's "async" suppliers. */ dev_for_each_link_to_supplier(link, dev) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_async_with_cleanup(link->supplier, func); device_links_read_unlock(idx); } static void dpm_async_suspend_complete_all(struct list_head *device_list) { struct device *dev; guard(mutex)(&async_wip_mtx); list_for_each_entry_reverse(dev, device_list, power.entry) { /* * In case the device is being waited for and async processing * has not started for it yet, let the waiters make progress. */ if (!dev->power.work_in_progress) complete_all(&dev->power.completion); } } /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } static void async_suspend_noirq(void *data, async_cookie_t cookie); /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Devices must be resumed unless they are explicitly allowed to be left * in suspend, but even in that case skipping the resume of devices that * were in use right before the system suspend (as indicated by their * runtime PM usage counters and child counters) would be suboptimal. */ if (!(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume) || !pm_runtime_need_not_resume(dev)) dev->power.must_resume = true; if (dev->power.must_resume) dpm_superior_set_must_resume(dev); Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_noirq); } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_late_early_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_noirq); } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_late_early_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_late_early_list, &dpm_noirq_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. */ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } static void async_suspend_late(void *data, async_cookie_t cookie); /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) goto Complete; if (pm_wakeup_pending()) { WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.direct_complete) goto Complete; /* * Disable runtime PM for the device without checking if there is a * pending resume request for it. */ __pm_runtime_disable(dev, false); if (dev->power.syscore) goto Skip; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); pm_runtime_enable(dev); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend_late); } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. */ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_suspended_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend_late); } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_suspended_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_suspended_list, &dpm_late_early_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. * @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } static void async_suspend(void *data, async_cookie_t cookie); /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static void device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (READ_ONCE(async_error)) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. * * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; WRITE_ONCE(async_error, -EBUSY); goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. */ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); dev->power.is_suspended = true; goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); if (error || READ_ONCE(async_error)) return; dpm_async_suspend_superior(dev, async_suspend); } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; int error; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Start processing "async" leaf devices upfront so they don't need to * wait for the "sync" devices they don't depend on. */ list_for_each_entry_reverse(dev, &dpm_prepared_list, power.entry) { dpm_clear_async_state(dev); if (dpm_leaf_device(dev)) dpm_async_with_cleanup(dev, async_suspend); } while (!list_empty(&dpm_prepared_list)) { dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_prepared_list); /* * Move all devices to the target list to resume them * properly. */ list_splice_init(&dpm_prepared_list, &dpm_suspended_list); break; } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } static bool device_prepare_smart_suspend(struct device *dev) { struct device_link *link; bool ret = true; int idx; /* * The "smart suspend" feature is enabled for devices whose drivers ask * for it and for devices without PM callbacks. * * However, if "smart suspend" is not enabled for the device's parent * or any of its suppliers that take runtime PM into account, it cannot * be enabled for the device either. */ if (!dev->power.no_pm_callbacks && !dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) return false; if (dev->parent && !dev_pm_smart_suspend(dev->parent) && !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) return false; idx = device_links_read_lock(); dev_for_each_link_to_supplier(link, dev) { if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) continue; if (!dev_pm_smart_suspend(link->supplier) && !pm_runtime_blocked(link->supplier)) { ret = false; break; } } device_links_read_unlock(idx); return ret; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; bool smart_suspend; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); /* * If runtime PM is disabled for the device at this point and it has * never been enabled so far, it should not be enabled until this system * suspend-resume cycle is complete, so prepare to trigger a warning on * subsequent attempts to enable it. */ smart_suspend = !pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; dev->power.out_band_wakeup = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* Do not enable "smart suspend" for devices with disabled runtime PM. */ if (smart_suspend) smart_suspend = device_prepare_smart_suspend(dev); spin_lock_irq(&dev->power.lock); dev->power.smart_suspend = smart_suspend; /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. */ dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. * * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else { pm_restrict_gfp_mask(); error = dpm_suspend(state); } dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. */ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_smart_suspend(dev) && pm_runtime_status_suspended(dev); }
2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 // SPDX-License-Identifier: GPL-2.0 #include <linux/cred.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/genetlink.h> static const struct genl_multicast_group quota_mcgrps[] = { { .name = "events", }, }; /* Netlink family structure for quota */ static struct genl_family quota_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, .maxattr = QUOTA_NL_A_MAX, .mcgrps = quota_mcgrps, .n_mcgrps = ARRAY_SIZE(quota_mcgrps), }; /** * quota_send_warning - Send warning to userspace about exceeded quota * @qid: The kernel internal quota identifier. * @dev: The device on which the fs is mounted (sb->s_dev) * @warntype: The type of the warning: QUOTA_NL_... * * This can be used by filesystems (including those which don't use * dquot) to send a message to userspace relating to quota limits. * */ void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { static atomic_t seq; struct sk_buff *skb; void *msg_head; int ret; int msg_size = 4 * nla_total_size(sizeof(u32)) + 2 * nla_total_size_64bit(sizeof(u64)); /* We have to allocate using GFP_NOFS as we are called from a * filesystem performing write and thus further recursion into * the fs to free some data could cause deadlocks. */ skb = genlmsg_new(msg_size, GFP_NOFS); if (!skb) { printk(KERN_ERR "VFS: Not enough memory to send quota warning.\n"); return; } msg_head = genlmsg_put(skb, 0, atomic_add_return(1, &seq), &quota_genl_family, 0, QUOTA_NL_C_WARNING); if (!msg_head) { printk(KERN_ERR "VFS: Cannot store netlink header in quota warning.\n"); goto err_out; } ret = nla_put_u32(skb, QUOTA_NL_A_QTYPE, qid.type); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_EXCESS_ID, from_kqid_munged(&init_user_ns, qid), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_WARNING, warntype); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MAJOR, MAJOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u32(skb, QUOTA_NL_A_DEV_MINOR, MINOR(dev)); if (ret) goto attr_err_out; ret = nla_put_u64_64bit(skb, QUOTA_NL_A_CAUSED_ID, from_kuid_munged(&init_user_ns, current_uid()), QUOTA_NL_A_PAD); if (ret) goto attr_err_out; genlmsg_end(skb, msg_head); genlmsg_multicast(&quota_genl_family, skb, 0, 0, GFP_NOFS); return; attr_err_out: printk(KERN_ERR "VFS: Not enough space to compose quota message!\n"); err_out: kfree_skb(skb); } EXPORT_SYMBOL(quota_send_warning); static int __init quota_init(void) { if (genl_register_family(&quota_genl_family) != 0) printk(KERN_ERR "VFS: Failed to create quota netlink interface.\n"); return 0; }; fs_initcall(quota_init);
4364 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IRQDESC_H #define _LINUX_IRQDESC_H #include <linux/rcupdate.h> #include <linux/kobject.h> #include <linux/mutex.h> /* * Core internal functions to deal with irq descriptors */ struct irq_affinity_notify; struct proc_dir_entry; struct module; struct irq_desc; struct irq_domain; struct pt_regs; /** * struct irqstat - interrupt statistics * @cnt: real-time interrupt count * @ref: snapshot of interrupt count */ struct irqstat { unsigned int cnt; #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT unsigned int ref; #endif }; /** * struct irq_desc - interrupt descriptor * @irq_common_data: per irq and chip data passed down to chip functions * @kstat_irqs: irq stats per cpu * @handle_irq: highlevel irq-events handler * @action: the irq action chain * @status_use_accessors: status information * @core_internal_state__do_not_mess_with_it: core internal status information * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple irq_set_irq_wake() callers * @tot_count: stats field for non-percpu irqs * @irq_count: stats field to detect stalled irqs * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts * @threads_handled: stats field for deferred spurious detection of threaded handlers * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers * @lock: locking for SMP * @affinity_hint: hint to user space for preferred irq affinity * @affinity_notify: context for notification of affinity changes * @pending_mask: pending rebalanced interrupts * @threads_oneshot: bitfield to handle shared oneshot threads * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers * @nr_actions: number of installed actions on this descriptor * @no_suspend_depth: number of irqactions on a irq descriptor with * IRQF_NO_SUSPEND set * @force_resume_depth: number of irqactions on a irq descriptor with * IRQF_FORCE_RESUME set * @rcu: rcu head for delayed free * @kobj: kobject used to represent this struct in sysfs * @request_mutex: mutex to protect request/free before locking desc->lock * @dir: /proc/irq/ procfs entry * @debugfs_file: dentry for the debugfs file * @name: flow handler name for /proc/interrupts output */ struct irq_desc { struct irq_common_data irq_common_data; struct irq_data irq_data; struct irqstat __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status_use_accessors; unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int tot_count; unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; atomic_t threads_handled; int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; struct irq_affinity_notify *affinity_notify; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif #endif unsigned long threads_oneshot; atomic_t threads_active; wait_queue_head_t wait_for_threads; #ifdef CONFIG_PM_SLEEP unsigned int nr_actions; unsigned int no_suspend_depth; unsigned int cond_suspend_depth; unsigned int force_resume_depth; #endif #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS struct dentry *debugfs_file; const char *dev_name; #endif #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; struct kobject kobj; #endif struct mutex request_mutex; int parent_irq; struct module *owner; const char *name; #ifdef CONFIG_HARDIRQS_SW_RESEND struct hlist_node resend_node; #endif } ____cacheline_internodealigned_in_smp; #ifdef CONFIG_SPARSE_IRQ extern void irq_lock_sparse(void); extern void irq_unlock_sparse(void); #else static inline void irq_lock_sparse(void) { } static inline void irq_unlock_sparse(void) { } extern struct irq_desc irq_desc[NR_IRQS]; #endif static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, unsigned int cpu) { return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) { return container_of(data->common, struct irq_desc, irq_common_data); } static inline unsigned int irq_desc_get_irq(struct irq_desc *desc) { return desc->irq_data.irq; } static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) { return &desc->irq_data; } static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc) { return desc->irq_data.chip; } static inline void *irq_desc_get_chip_data(struct irq_desc *desc) { return desc->irq_data.chip_data; } static inline void *irq_desc_get_handler_data(struct irq_desc *desc) { return desc->irq_common_data.handler_data; } /* * Architectures call this to let the generic IRQ layer * handle an interrupt. */ static inline void generic_handle_irq_desc(struct irq_desc *desc) { desc->handle_irq(desc); } int handle_irq_desc(struct irq_desc *desc); int generic_handle_irq(unsigned int irq); int generic_handle_irq_safe(unsigned int irq); #ifdef CONFIG_IRQ_DOMAIN /* * Convert a HW interrupt number to a logical one using a IRQ domain, * and handle the result interrupt number. Return -EINVAL if * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq); int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq); int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq); #endif /* Test to see if a driver has successfully requested an irq */ static inline int irq_desc_has_action(struct irq_desc *desc) { return desc && desc->action != NULL; } /** * irq_set_handler_locked - Set irq handler from a locked region * @data: Pointer to the irq_data structure which identifies the irq * @handler: Flow control handler function for this interrupt * * Sets the handler in the irq descriptor associated to @data. * * Must be called with irq_desc locked and valid parameters. Typical * call site is the irq_set_type() callback. */ static inline void irq_set_handler_locked(struct irq_data *data, irq_flow_handler_t handler) { struct irq_desc *desc = irq_data_to_desc(data); desc->handle_irq = handler; } /** * irq_set_chip_handler_name_locked - Set chip, handler and name from a locked region * @data: Pointer to the irq_data structure for which the chip is set * @chip: Pointer to the new irq chip * @handler: Flow control handler function for this interrupt * @name: Name of the interrupt * * Replace the irq chip at the proper hierarchy level in @data and * sets the handler and name in the associated irq descriptor. * * Must be called with irq_desc locked and valid parameters. */ static inline void irq_set_chip_handler_name_locked(struct irq_data *data, const struct irq_chip *chip, irq_flow_handler_t handler, const char *name) { struct irq_desc *desc = irq_data_to_desc(data); desc->handle_irq = handler; desc->name = name; data->chip = (struct irq_chip *)chip; } bool irq_check_status_bit(unsigned int irq, unsigned int bitmask); static inline bool irq_balancing_disabled(unsigned int irq) { return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK); } static inline bool irq_is_percpu(unsigned int irq) { return irq_check_status_bit(irq, IRQ_PER_CPU); } static inline bool irq_is_percpu_devid(unsigned int irq) { return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class); static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { if (IS_ENABLED(CONFIG_LOCKDEP)) __irq_set_lockdep_class(irq, lock_class, request_class); } #endif
228 228 226 2 2 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock - Credential hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI * Copyright © 2024-2025 Microsoft Corporation */ #include <linux/binfmts.h> #include <linux/cred.h> #include <linux/lsm_hooks.h> #include "common.h" #include "cred.h" #include "ruleset.h" #include "setup.h" static void hook_cred_transfer(struct cred *const new, const struct cred *const old) { const struct landlock_cred_security *const old_llcred = landlock_cred(old); if (old_llcred->domain) { landlock_get_ruleset(old_llcred->domain); *landlock_cred(new) = *old_llcred; } } static int hook_cred_prepare(struct cred *const new, const struct cred *const old, const gfp_t gfp) { hook_cred_transfer(new, old); return 0; } static void hook_cred_free(struct cred *const cred) { struct landlock_ruleset *const dom = landlock_cred(cred)->domain; if (dom) landlock_put_ruleset_deferred(dom); } #ifdef CONFIG_AUDIT static int hook_bprm_creds_for_exec(struct linux_binprm *const bprm) { /* Resets for each execution. */ landlock_cred(bprm->cred)->domain_exec = 0; return 0; } #endif /* CONFIG_AUDIT */ static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), LSM_HOOK_INIT(cred_transfer, hook_cred_transfer), LSM_HOOK_INIT(cred_free, hook_cred_free), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(bprm_creds_for_exec, hook_bprm_creds_for_exec), #endif /* CONFIG_AUDIT */ }; __init void landlock_add_cred_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); }
59 59 37 22 58 59 5300 5294 48 3 3 3 34 34 34 20 20 20 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 // SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)){ return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as contrainsted */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID. */ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. */ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .id = &safesetid_lsmid, .init = safesetid_security_init, .initcall_fs = safesetid_init_securityfs, };
4 4 4 4 1 1 1 2 2 2 2 2 1 1 2 4 1 1 2 2 1 1 5 1 4 4 4 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 /* * userio kernel serio device emulation module * Copyright (C) 2015 Red Hat * Copyright (C) 2015 Stephen Chandler Paul <thatslyude@gmail.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser * General Public License for more details. */ #include <linux/circ_buf.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/serio.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/miscdevice.h> #include <linux/sched.h> #include <linux/poll.h> #include <uapi/linux/userio.h> #define USERIO_NAME "userio" #define USERIO_BUFSIZE 16 static struct miscdevice userio_misc; struct userio_device { struct serio *serio; struct mutex mutex; bool running; u8 head; u8 tail; spinlock_t buf_lock; unsigned char buf[USERIO_BUFSIZE]; wait_queue_head_t waitq; }; /** * userio_device_write - Write data from serio to a userio device in userspace * @id: The serio port for the userio device * @val: The data to write to the device */ static int userio_device_write(struct serio *id, unsigned char val) { struct userio_device *userio = id->port_data; scoped_guard(spinlock_irqsave, &userio->buf_lock) { userio->buf[userio->head] = val; userio->head = (userio->head + 1) % USERIO_BUFSIZE; if (userio->head == userio->tail) dev_warn(userio_misc.this_device, "Buffer overflowed, userio client isn't keeping up"); } wake_up_interruptible(&userio->waitq); return 0; } static int userio_char_open(struct inode *inode, struct file *file) { struct userio_device *userio __free(kfree) = kzalloc(sizeof(*userio), GFP_KERNEL); if (!userio) return -ENOMEM; mutex_init(&userio->mutex); spin_lock_init(&userio->buf_lock); init_waitqueue_head(&userio->waitq); userio->serio = kzalloc(sizeof(*userio->serio), GFP_KERNEL); if (!userio->serio) return -ENOMEM; userio->serio->write = userio_device_write; userio->serio->port_data = userio; file->private_data = no_free_ptr(userio); return 0; } static int userio_char_release(struct inode *inode, struct file *file) { struct userio_device *userio = file->private_data; if (userio->running) { /* * Don't free the serio port here, serio_unregister_port() * does it for us. */ serio_unregister_port(userio->serio); } else { kfree(userio->serio); } kfree(userio); return 0; } static size_t userio_fetch_data(struct userio_device *userio, u8 *buf, size_t count, size_t *copylen) { size_t available, len; guard(spinlock_irqsave)(&userio->buf_lock); available = CIRC_CNT_TO_END(userio->head, userio->tail, USERIO_BUFSIZE); len = min(available, count); if (len) { memcpy(buf, &userio->buf[userio->tail], len); userio->tail = (userio->tail + len) % USERIO_BUFSIZE; } *copylen = len; return available; } static ssize_t userio_char_read(struct file *file, char __user *user_buffer, size_t count, loff_t *ppos) { struct userio_device *userio = file->private_data; int error; size_t available, copylen; u8 buf[USERIO_BUFSIZE]; /* * By the time we get here, the data that was waiting might have * been taken by another thread. Grab the buffer lock and check if * there's still any data waiting, otherwise repeat this process * until we have data (unless the file descriptor is non-blocking * of course). */ for (;;) { available = userio_fetch_data(userio, buf, count, &copylen); if (available) break; /* buffer was/is empty */ if (file->f_flags & O_NONBLOCK) return -EAGAIN; /* * count == 0 is special - no IO is done but we check * for error conditions (see above). */ if (count == 0) return 0; error = wait_event_interruptible(userio->waitq, userio->head != userio->tail); if (error) return error; } if (copylen) if (copy_to_user(user_buffer, buf, copylen)) return -EFAULT; return copylen; } static int userio_execute_cmd(struct userio_device *userio, const struct userio_cmd *cmd) { switch (cmd->type) { case USERIO_CMD_REGISTER: if (!userio->serio->id.type) { dev_warn(userio_misc.this_device, "No port type given on /dev/userio\n"); return -EINVAL; } if (userio->running) { dev_warn(userio_misc.this_device, "Begin command sent, but we're already running\n"); return -EBUSY; } userio->running = true; serio_register_port(userio->serio); break; case USERIO_CMD_SET_PORT_TYPE: if (userio->running) { dev_warn(userio_misc.this_device, "Can't change port type on an already running userio instance\n"); return -EBUSY; } userio->serio->id.type = cmd->data; break; case USERIO_CMD_SEND_INTERRUPT: if (!userio->running) { dev_warn(userio_misc.this_device, "The device must be registered before sending interrupts\n"); return -ENODEV; } serio_interrupt(userio->serio, cmd->data, 0); break; default: return -EOPNOTSUPP; } return 0; } static ssize_t userio_char_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct userio_device *userio = file->private_data; struct userio_cmd cmd; int error; if (count != sizeof(cmd)) { dev_warn(userio_misc.this_device, "Invalid payload size\n"); return -EINVAL; } if (copy_from_user(&cmd, buffer, sizeof(cmd))) return -EFAULT; scoped_cond_guard(mutex_intr, return -EINTR, &userio->mutex) { error = userio_execute_cmd(userio, &cmd); if (error) return error; } return count; } static __poll_t userio_char_poll(struct file *file, poll_table *wait) { struct userio_device *userio = file->private_data; poll_wait(file, &userio->waitq, wait); if (userio->head != userio->tail) return EPOLLIN | EPOLLRDNORM; return 0; } static const struct file_operations userio_fops = { .owner = THIS_MODULE, .open = userio_char_open, .release = userio_char_release, .read = userio_char_read, .write = userio_char_write, .poll = userio_char_poll, }; static struct miscdevice userio_misc = { .fops = &userio_fops, .minor = USERIO_MINOR, .name = USERIO_NAME, }; module_driver(userio_misc, misc_register, misc_deregister); MODULE_ALIAS_MISCDEV(USERIO_MINOR); MODULE_ALIAS("devname:" USERIO_NAME); MODULE_AUTHOR("Stephen Chandler Paul <thatslyude@gmail.com>"); MODULE_DESCRIPTION("Virtual Serio Device Support"); MODULE_LICENSE("GPL");
2 2 63 62 64 64 64 64 63 64 64 64 226 3 2 4 2 3 228 141 54 2 2 8 8 4 64 75 57 57 56 57 55 57 55 56 17 12 63 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc(sizeof(*work), GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
49 50 13 13 13 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 // SPDX-License-Identifier: GPL-2.0-or-later /* auditsc.c -- System-call auditing support * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright (C) 2005, 2006 IBM Corporation * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Many of the ideas implemented here are from Stephen C. Tweedie, * especially the idea of avoiding a copy by using getname. * * The method for actual interception of syscall entry and exit (not in * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>, * 2006. * * The support of additional filter rules compares (>, <, >=, <=) was * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. * * Modified by Amy Griffis <amy.griffis@hp.com> to collect additional * filesystem information. * * Subject and object context labeling support added by <danjones@us.ibm.com> * and <dustin.kirkland@us.ibm.com> for LSPP certification compliance. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <asm/types.h> #include <linux/atomic.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/mount.h> #include <linux/socket.h> #include <linux/mqueue.h> #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> #include <linux/netlink.h> #include <linux/compiler.h> #include <asm/unistd.h> #include <linux/security.h> #include <linux/list.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <asm/syscall.h> #include <linux/capability.h> #include <linux/fs_struct.h> #include <linux/compat.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/fsnotify_backend.h> #include <uapi/linux/limits.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/openat2.h> // struct open_how #include <uapi/linux/fanotify.h> #include "audit.h" /* flags stating the success for a syscall */ #define AUDITSC_INVALID 0 #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 /* no execve audit message should be longer than this (userspace limits), * see the note near the top of audit_log_execve_info() about this value */ #define MAX_EXECVE_AUDIT_LEN 7500 /* max length to print of cmdline/proctitle value during audit */ #define MAX_PROCTITLE_AUDIT_LEN 128 /* number of audit rules */ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; struct audit_aux_data { struct audit_aux_data *next; int type; }; /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; kuid_t target_auid[AUDIT_AUX_PIDS]; kuid_t target_uid[AUDIT_AUX_PIDS]; unsigned int target_sessionid[AUDIT_AUX_PIDS]; struct lsm_prop target_ref[AUDIT_AUX_PIDS]; char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; int pid_count; }; struct audit_aux_data_bprm_fcaps { struct audit_aux_data d; struct audit_cap_data fcap; unsigned int fcap_ver; struct audit_cap_data old_pcap; struct audit_cap_data new_pcap; }; struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; }; struct audit_nfcfgop_tab { enum audit_nfcfgop op; const char *s; }; static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_XT_OP_REGISTER, "xt_register" }, { AUDIT_XT_OP_REPLACE, "xt_replace" }, { AUDIT_XT_OP_UNREGISTER, "xt_unregister" }, { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" }, { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" }, { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" }, { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" }, { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" }, { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" }, { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" }, { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" }, { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" }, { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" }, { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" }, { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" }, { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" }, { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" }, { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" }, { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; if (unlikely(!ctx)) return 0; n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case AUDITSC_NATIVE: if ((mask & AUDIT_PERM_WRITE) && audit_match_class(AUDIT_CLASS_WRITE, n)) return 1; if ((mask & AUDIT_PERM_READ) && audit_match_class(AUDIT_CLASS_READ, n)) return 1; if ((mask & AUDIT_PERM_ATTR) && audit_match_class(AUDIT_CLASS_CHATTR, n)) return 1; return 0; case AUDITSC_COMPAT: /* 32bit on biarch */ if ((mask & AUDIT_PERM_WRITE) && audit_match_class(AUDIT_CLASS_WRITE_32, n)) return 1; if ((mask & AUDIT_PERM_READ) && audit_match_class(AUDIT_CLASS_READ_32, n)) return 1; if ((mask & AUDIT_PERM_ATTR) && audit_match_class(AUDIT_CLASS_CHATTR_32, n)) return 1; return 0; case AUDITSC_OPEN: return mask & ACC_MODE(ctx->argv[1]); case AUDITSC_OPENAT: return mask & ACC_MODE(ctx->argv[2]); case AUDITSC_SOCKETCALL: return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND); case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } } static int audit_match_filetype(struct audit_context *ctx, int val) { struct audit_names *n; umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; list_for_each_entry(n, &ctx->names_list, list) { if ((n->ino != AUDIT_INO_UNSET) && ((n->mode & S_IFMT) == mode)) return 1; } return 0; } /* * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; * ->first_trees points to its beginning, ->trees - to the current end of data. * ->tree_count is the number of free entries in array pointed to by ->trees. * Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL, * "empty" becomes (p, p, 31) afterwards. We don't shrink the list (and seriously, * it's going to remain 1-element for almost any setup) until we free context itself. * References in it _are_ dropped - at the same time we free/drop aux stuff. */ static void audit_set_auditable(struct audit_context *ctx) { if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; } } static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) { struct audit_tree_refs *p = ctx->trees; int left = ctx->tree_count; if (likely(left)) { p->c[--left] = chunk; ctx->tree_count = left; return 1; } if (!p) return 0; p = p->next; if (p) { p->c[30] = chunk; ctx->trees = p; ctx->tree_count = 30; return 1; } return 0; } static int grow_tree_refs(struct audit_context *ctx) { struct audit_tree_refs *p = ctx->trees; ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL); if (!ctx->trees) { ctx->trees = p; return 0; } if (p) p->next = ctx->trees; else ctx->first_trees = ctx->trees; ctx->tree_count = 31; return 1; } static void unroll_tree_refs(struct audit_context *ctx, struct audit_tree_refs *p, int count) { struct audit_tree_refs *q; int n; if (!p) { /* we started with empty chain */ p = ctx->first_trees; count = 31; /* if the very first allocation has failed, nothing to do */ if (!p) return; } n = count; for (q = p; q != ctx->trees; q = q->next, n = 31) { while (n--) { audit_put_chunk(q->c[n]); q->c[n] = NULL; } } while (n-- > ctx->tree_count) { audit_put_chunk(q->c[n]); q->c[n] = NULL; } ctx->trees = p; ctx->tree_count = count; } static void free_tree_refs(struct audit_context *ctx) { struct audit_tree_refs *p, *q; for (p = ctx->first_trees; p; p = q) { q = p->next; kfree(p); } } static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) { struct audit_tree_refs *p; int n; if (!tree) return 0; /* full ones */ for (p = ctx->first_trees; p != ctx->trees; p = p->next) { for (n = 0; n < 31; n++) if (audit_tree_match(p->c[n], tree)) return 1; } /* partial */ if (p) { for (n = ctx->tree_count; n < 31; n++) if (audit_tree_match(p->c[n], tree)) return 1; } return 0; } static int audit_compare_uid(kuid_t uid, struct audit_names *name, struct audit_field *f, struct audit_context *ctx) { struct audit_names *n; int rc; if (name) { rc = audit_uid_comparator(uid, f->op, name->uid); if (rc) return rc; } if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { rc = audit_uid_comparator(uid, f->op, n->uid); if (rc) return rc; } } return 0; } static int audit_compare_gid(kgid_t gid, struct audit_names *name, struct audit_field *f, struct audit_context *ctx) { struct audit_names *n; int rc; if (name) { rc = audit_gid_comparator(gid, f->op, name->gid); if (rc) return rc; } if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { rc = audit_gid_comparator(gid, f->op, n->gid); if (rc) return rc; } } return 0; } static int audit_field_compare(struct task_struct *tsk, const struct cred *cred, struct audit_field *f, struct audit_context *ctx, struct audit_names *name) { switch (f->val) { /* process to file object comparisons */ case AUDIT_COMPARE_UID_TO_OBJ_UID: return audit_compare_uid(cred->uid, name, f, ctx); case AUDIT_COMPARE_GID_TO_OBJ_GID: return audit_compare_gid(cred->gid, name, f, ctx); case AUDIT_COMPARE_EUID_TO_OBJ_UID: return audit_compare_uid(cred->euid, name, f, ctx); case AUDIT_COMPARE_EGID_TO_OBJ_GID: return audit_compare_gid(cred->egid, name, f, ctx); case AUDIT_COMPARE_AUID_TO_OBJ_UID: return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: return audit_compare_gid(cred->sgid, name, f, ctx); case AUDIT_COMPARE_FSUID_TO_OBJ_UID: return audit_compare_uid(cred->fsuid, name, f, ctx); case AUDIT_COMPARE_FSGID_TO_OBJ_GID: return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: return audit_uid_comparator(cred->uid, f->op, audit_get_loginuid(tsk)); case AUDIT_COMPARE_UID_TO_EUID: return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: return audit_uid_comparator(cred->uid, f->op, cred->suid); case AUDIT_COMPARE_UID_TO_FSUID: return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: return audit_uid_comparator(audit_get_loginuid(tsk), f->op, cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: return audit_uid_comparator(cred->euid, f->op, cred->suid); case AUDIT_COMPARE_EUID_TO_FSUID: return audit_uid_comparator(cred->euid, f->op, cred->fsuid); /* suid comparisons */ case AUDIT_COMPARE_SUID_TO_FSUID: return audit_uid_comparator(cred->suid, f->op, cred->fsuid); /* gid comparisons */ case AUDIT_COMPARE_GID_TO_EGID: return audit_gid_comparator(cred->gid, f->op, cred->egid); case AUDIT_COMPARE_GID_TO_SGID: return audit_gid_comparator(cred->gid, f->op, cred->sgid); case AUDIT_COMPARE_GID_TO_FSGID: return audit_gid_comparator(cred->gid, f->op, cred->fsgid); /* egid comparisons */ case AUDIT_COMPARE_EGID_TO_SGID: return audit_gid_comparator(cred->egid, f->op, cred->sgid); case AUDIT_COMPARE_EGID_TO_FSGID: return audit_gid_comparator(cred->egid, f->op, cred->fsgid); /* sgid comparison */ case AUDIT_COMPARE_SGID_TO_FSGID: return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); default: WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); return 0; } return 0; } /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. * * If task_creation is true, this is an explicit indication that we are * filtering a task rule at task creation time. This and tsk == current are * the only situations where tsk->cred may be accessed without an rcu read lock. */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, struct audit_names *name, enum audit_state *state, bool task_creation) { const struct cred *cred; int i, need_sid = 1; struct lsm_prop prop = { }; unsigned int sessionid; if (ctx && rule->prio <= ctx->prio) return 0; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; struct audit_names *n; int result = 0; pid_t pid; switch (f->type) { case AUDIT_PID: pid = task_tgid_nr(tsk); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: if (ctx) { if (!ctx->ppid) ctx->ppid = task_ppid_nr(tsk); result = audit_comparator(ctx->ppid, f->op, f->val); } break; case AUDIT_EXE: result = audit_exe_compare(tsk, rule->exe); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); break; case AUDIT_EUID: result = audit_uid_comparator(cred->euid, f->op, f->uid); break; case AUDIT_SUID: result = audit_uid_comparator(cred->suid, f->op, f->uid); break; case AUDIT_FSUID: result = audit_uid_comparator(cred->fsuid, f->op, f->uid); break; case AUDIT_GID: result = audit_gid_comparator(cred->gid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); if (f->op == Audit_equal) { if (!result) result = groups_search(cred->group_info, f->gid); } else if (f->op == Audit_not_equal) { if (result) result = !groups_search(cred->group_info, f->gid); } break; case AUDIT_SGID: result = audit_gid_comparator(cred->sgid, f->op, f->gid); break; case AUDIT_FSGID: result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_SESSIONID: sessionid = audit_get_sessionid(tsk); result = audit_comparator(sessionid, f->op, f->val); break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); break; case AUDIT_ARCH: if (ctx) result = audit_comparator(ctx->arch, f->op, f->val); break; case AUDIT_EXIT: if (ctx && ctx->return_valid != AUDITSC_INVALID) result = audit_comparator(ctx->return_code, f->op, f->val); break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid != AUDITSC_INVALID) { if (f->val) result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); else result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); } break; case AUDIT_DEVMAJOR: if (name) { if (audit_comparator(MAJOR(name->dev), f->op, f->val) || audit_comparator(MAJOR(name->rdev), f->op, f->val)) ++result; } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(MAJOR(n->dev), f->op, f->val) || audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } } } break; case AUDIT_DEVMINOR: if (name) { if (audit_comparator(MINOR(name->dev), f->op, f->val) || audit_comparator(MINOR(name->rdev), f->op, f->val)) ++result; } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(MINOR(n->dev), f->op, f->val) || audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } } } break; case AUDIT_INODE: if (name) result = audit_comparator(name->ino, f->op, f->val); else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(n->ino, f->op, f->val)) { ++result; break; } } } break; case AUDIT_OBJ_UID: if (name) { result = audit_uid_comparator(name->uid, f->op, f->uid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_uid_comparator(n->uid, f->op, f->uid)) { ++result; break; } } } break; case AUDIT_OBJ_GID: if (name) { result = audit_gid_comparator(name->gid, f->op, f->gid); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_gid_comparator(n->gid, f->op, f->gid)) { ++result; break; } } } break; case AUDIT_WATCH: if (name) { result = audit_watch_compare(rule->watch, name->ino, name->dev); if (f->op == Audit_not_equal) result = !result; } break; case AUDIT_DIR: if (ctx) { result = match_tree_refs(ctx, rule->tree); if (f->op == Audit_not_equal) result = !result; } break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(tsk), f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; case AUDIT_SADDR_FAM: if (ctx && ctx->sockaddr) result = audit_comparator(ctx->sockaddr->ss_family, f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: /* NOTE: this may return negative values indicating a temporary error. We simply treat this as a match for now to avoid losing information that may be wanted. An error message will also be logged upon error */ if (f->lsm_rule) { if (need_sid) { /* @tsk should always be equal to * @current with the exception of * fork()/copy_process() in which case * the new @tsk creds are still a dup * of @current's creds so we can still * use * security_current_getlsmprop_subj() * here even though it always refs * @current's creds */ security_current_getlsmprop_subj(&prop); need_sid = 0; } result = security_audit_rule_match(&prop, f->type, f->op, f->lsm_rule); } break; case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR also applies here */ if (f->lsm_rule) { /* Find files that match */ if (name) { result = security_audit_rule_match( &name->oprop, f->type, f->op, f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (security_audit_rule_match( &n->oprop, f->type, f->op, f->lsm_rule)) { ++result; break; } } } /* Find ipc objects that match */ if (!ctx || ctx->type != AUDIT_IPC) break; if (security_audit_rule_match(&ctx->ipc.oprop, f->type, f->op, f->lsm_rule)) ++result; } break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: if (ctx) result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); break; case AUDIT_FILTERKEY: /* ignore this field for filtering */ result = 1; break; case AUDIT_PERM: result = audit_match_perm(ctx, f->val); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); if (f->op == Audit_not_equal) result = !result; break; case AUDIT_FIELD_COMPARE: result = audit_field_compare(tsk, cred, f, ctx, name); break; } if (!result) return 0; } if (ctx) { if (rule->filterkey) { kfree(ctx->filterkey); ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); } ctx->prio = rule->prio; } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_STATE_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_STATE_RECORD; break; } return 1; } /* At process creation time, we can determine if system-call auditing is * completely disabled for this task. Since we only have the task * structure at this point, we can only check uid and gid. */ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) { struct audit_entry *e; enum audit_state state; rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state, true)) { if (state == AUDIT_STATE_RECORD) *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); return state; } } rcu_read_unlock(); return AUDIT_STATE_BUILD; } static int audit_in_mask(const struct audit_krule *rule, unsigned long val) { int word, bit; if (val > 0xffffffff) return false; word = AUDIT_WORD(val); if (word >= AUDIT_BITMASK_SIZE) return false; bit = AUDIT_BIT(val); return rule->mask[word] & bit; } /** * __audit_filter_op - common filter helper for operations (syscall/uring/etc) * @tsk: associated task * @ctx: audit context * @list: audit filter list * @name: audit_name (can be NULL) * @op: current syscall/uring_op * * Run the udit filters specified in @list against @tsk using @ctx, * @name, and @op, as necessary; the caller is responsible for ensuring * that the call is made while the RCU read lock is held. The @name * parameter can be NULL, but all others must be specified. * Returns 1/true if the filter finds a match, 0/false if none are found. */ static int __audit_filter_op(struct task_struct *tsk, struct audit_context *ctx, struct list_head *list, struct audit_names *name, unsigned long op) { struct audit_entry *e; enum audit_state state; list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, op) && audit_filter_rules(tsk, &e->rule, ctx, name, &state, false)) { ctx->current_state = state; return 1; } } return 0; } /** * audit_filter_uring - apply filters to an io_uring operation * @tsk: associated task * @ctx: audit context */ static void audit_filter_uring(struct task_struct *tsk, struct audit_context *ctx) { if (auditd_test_task(tsk)) return; rcu_read_lock(); __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT], NULL, ctx->uring_op); rcu_read_unlock(); } /* At syscall exit time, this filter is called if the audit_state is * not low enough that auditing cannot take place, but is also not * high enough that we already know we have to write an audit record * (i.e., the state is AUDIT_STATE_BUILD). */ static void audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx) { if (auditd_test_task(tsk)) return; rcu_read_lock(); __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT], NULL, ctx->major); rcu_read_unlock(); } /* * Given an audit_name check the inode hash table to see if they match. * Called holding the rcu read lock to protect the use of audit_inode_hash */ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; return __audit_filter_op(tsk, ctx, list, n, ctx->major); } /* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { struct audit_names *n; if (auditd_test_task(tsk)) return; rcu_read_lock(); list_for_each_entry(n, &ctx->names_list, list) { if (audit_filter_inode_name(tsk, n, ctx)) break; } rcu_read_unlock(); } static inline void audit_proctitle_free(struct audit_context *context) { kfree(context->proctitle.value); context->proctitle.value = NULL; context->proctitle.len = 0; } static inline void audit_free_module(struct audit_context *context) { if (context->type == AUDIT_KERN_MODULE) { kfree(context->module.name); context->module.name = NULL; } } static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); if (n->name) putname(n->name); if (n->should_free) kfree(n); } context->name_count = 0; path_put(&context->pwd); context->pwd.dentry = NULL; context->pwd.mnt = NULL; } static inline void audit_free_aux(struct audit_context *context) { struct audit_aux_data *aux; while ((aux = context->aux)) { context->aux = aux->next; kfree(aux); } context->aux = NULL; while ((aux = context->aux_pids)) { context->aux_pids = aux->next; kfree(aux); } context->aux_pids = NULL; } /** * audit_reset_context - reset a audit_context structure * @ctx: the audit_context to reset * * All fields in the audit_context will be reset to an initial state, all * references held by fields will be dropped, and private memory will be * released. When this function returns the audit_context will be suitable * for reuse, so long as the passed context is not NULL or a dummy context. */ static void audit_reset_context(struct audit_context *ctx) { if (!ctx) return; /* if ctx is non-null, reset the "ctx->context" regardless */ ctx->context = AUDIT_CTX_UNUSED; if (ctx->dummy) return; /* * NOTE: It shouldn't matter in what order we release the fields, so * release them in the order in which they appear in the struct; * this gives us some hope of quickly making sure we are * resetting the audit_context properly. * * Other things worth mentioning: * - we don't reset "dummy" * - we don't reset "state", we do reset "current_state" * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD * - much of this is likely overkill, but play it safe for now * - we really need to work on improving the audit_context struct */ ctx->current_state = ctx->state; ctx->stamp.serial = 0; ctx->stamp.ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; ctx->major = 0; ctx->uring_op = 0; memset(ctx->argv, 0, sizeof(ctx->argv)); ctx->return_code = 0; ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0); ctx->return_valid = AUDITSC_INVALID; audit_free_names(ctx); if (ctx->state != AUDIT_STATE_RECORD) { kfree(ctx->filterkey); ctx->filterkey = NULL; } audit_free_aux(ctx); kfree(ctx->sockaddr); ctx->sockaddr = NULL; ctx->sockaddr_len = 0; ctx->ppid = 0; ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0); ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0); ctx->personality = 0; ctx->arch = 0; ctx->target_pid = 0; ctx->target_auid = ctx->target_uid = KUIDT_INIT(0); ctx->target_sessionid = 0; lsmprop_init(&ctx->target_ref); ctx->target_comm[0] = '\0'; unroll_tree_refs(ctx, NULL, 0); WARN_ON(!list_empty(&ctx->killed_trees)); audit_free_module(ctx); ctx->fds[0] = -1; ctx->type = 0; /* reset last for audit_free_*() */ } static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return NULL; context->context = AUDIT_CTX_UNUSED; context->state = state; context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); context->fds[0] = -1; context->return_valid = AUDITSC_INVALID; return context; } /** * audit_alloc - allocate an audit context block for a task * @tsk: task * * Filter on the task information and allocate a per-task audit context * if necessary. Doing so turns on system call auditing for the * specified task. This is called from copy_process, so no lock is * needed. */ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; enum audit_state state; char *key = NULL; if (likely(!audit_ever_enabled)) return 0; state = audit_filter_task(tsk, &key); if (state == AUDIT_STATE_DISABLED) { clear_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } context = audit_alloc_context(state); if (!context) { kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; } context->filterkey = key; audit_set_context(tsk, context); set_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } static inline void audit_free_context(struct audit_context *context) { /* resetting is extra work, but it is likely just noise */ audit_reset_context(context); audit_proctitle_free(context); free_tree_refs(context); kfree(context->filterkey); kfree(context); } static int audit_log_pid_context(struct audit_context *context, pid_t pid, kuid_t auid, kuid_t uid, unsigned int sessionid, struct lsm_prop *prop, char *comm) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); if (!ab) return rc; audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) rc = 1; audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); return rc; } static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab) { long len_max; long len_rem; long len_full; long len_buf; long len_abuf = 0; long len_tmp; bool require_data; bool encode; unsigned int iter; unsigned int arg; char *buf_head; char *buf; const char __user *p = (const char __user *)current->mm->arg_start; /* NOTE: this buffer needs to be large enough to hold all the non-arg * data we put in the audit record for this argument (see the * code below) ... at this point in time 96 is plenty */ char abuf[96]; /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the * current value of 7500 is not as important as the fact that it * is less than 8k, a setting of 7500 gives us plenty of wiggle * room if we go over a little bit in the logging below */ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500); len_max = MAX_EXECVE_AUDIT_LEN; /* scratch buffer to hold the userspace args */ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); if (!buf_head) { audit_panic("out of memory for argv string"); return; } buf = buf_head; audit_log_format(*ab, "argc=%d", context->execve.argc); len_rem = len_max; len_buf = 0; len_full = 0; require_data = true; encode = false; iter = 0; arg = 0; do { /* NOTE: we don't ever want to trust this value for anything * serious, but the audit record format insists we * provide an argument length for really long arguments, * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but * to use strncpy_from_user() to obtain this value for * recording in the log, although we don't use it * anywhere here to avoid a double-fetch problem */ if (len_full == 0) len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1; /* read more data from userspace */ if (require_data) { /* can we make more room in the buffer? */ if (buf != buf_head) { memmove(buf_head, buf, len_buf); buf = buf_head; } /* fetch as much as we can of the argument */ len_tmp = strncpy_from_user(&buf_head[len_buf], p, len_max - len_buf); if (len_tmp == -EFAULT) { /* unable to copy from userspace */ send_sig(SIGKILL, current, 0); goto out; } else if (len_tmp == (len_max - len_buf)) { /* buffer is not large enough */ require_data = true; /* NOTE: if we are going to span multiple * buffers force the encoding so we stand * a chance at a sane len_full value and * consistent record encoding */ encode = true; len_full = len_full * 2; p += len_tmp; } else { require_data = false; if (!encode) encode = audit_string_contains_control( buf, len_tmp); /* try to use a trusted value for len_full */ if (len_full < len_max) len_full = (encode ? len_tmp * 2 : len_tmp); p += len_tmp + 1; } len_buf += len_tmp; buf_head[len_buf] = '\0'; /* length of the buffer in the audit record? */ len_abuf = (encode ? len_buf * 2 : len_buf + 2); } /* write as much as we can to the audit log */ if (len_buf >= 0) { /* NOTE: some magic numbers here - basically if we * can't fit a reasonable amount of data into the * existing audit buffer, flush it and start with * a new buffer */ if ((sizeof(abuf) + 8) > len_rem) { len_rem = len_max; audit_log_end(*ab); *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); if (!*ab) goto out; } /* create the non-arg portion of the arg record */ len_tmp = 0; if (require_data || (iter > 0) || ((len_abuf + sizeof(abuf)) > len_rem)) { if (iter == 0) { len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d_len=%lu", arg, len_full); } len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d[%d]=", arg, iter++); } else len_tmp += snprintf(&abuf[len_tmp], sizeof(abuf) - len_tmp, " a%d=", arg); WARN_ON(len_tmp >= sizeof(abuf)); abuf[sizeof(abuf) - 1] = '\0'; /* log the arg in the audit record */ audit_log_format(*ab, "%s", abuf); len_rem -= len_tmp; len_tmp = len_buf; if (encode) { if (len_abuf > len_rem) len_tmp = len_rem / 2; /* encoding */ audit_log_n_hex(*ab, buf, len_tmp); len_rem -= len_tmp * 2; len_abuf -= len_tmp * 2; } else { if (len_abuf > len_rem) len_tmp = len_rem - 2; /* quotes */ audit_log_n_string(*ab, buf, len_tmp); len_rem -= len_tmp + 2; /* don't subtract the "2" because we still need * to add quotes to the remaining string */ len_abuf -= len_tmp; } len_buf -= len_tmp; buf += len_tmp; } /* ready to move to the next argument? */ if ((len_buf == 0) && !require_data) { arg++; iter = 0; len_full = 0; require_data = true; encode = false; } } while (arg < context->execve.argc); /* NOTE: the caller handles the final audit_log_end() call */ out: kfree(buf_head); } static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) { if (cap_isclear(*cap)) { audit_log_format(ab, " %s=0", prefix); return; } audit_log_format(ab, " %s=%016llx", prefix, cap->val); } static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { if (name->fcap_ver == -1) { audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?"); return; } audit_log_cap(ab, "cap_fp", &name->fcap.permitted); audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d", name->fcap.fE, name->fcap_ver, from_kuid(&init_user_ns, name->fcap.rootid)); } static void audit_log_time(struct audit_context *context, struct audit_buffer **ab) { const struct audit_ntp_data *ntp = &context->time.ntp_data; const struct timespec64 *tk = &context->time.tk_injoffset; static const char * const ntp_name[] = { "offset", "freq", "status", "tai", "tick", "adjust", }; int type; if (context->type == AUDIT_TIME_ADJNTPVAL) { for (type = 0; type < AUDIT_NTP_NVALS; type++) { if (ntp->vals[type].newval != ntp->vals[type].oldval) { if (!*ab) { *ab = audit_log_start(context, GFP_KERNEL, AUDIT_TIME_ADJNTPVAL); if (!*ab) return; } audit_log_format(*ab, "op=%s old=%lli new=%lli", ntp_name[type], ntp->vals[type].oldval, ntp->vals[type].newval); audit_log_end(*ab); *ab = NULL; } } } if (tk->tv_sec != 0 || tk->tv_nsec != 0) { if (!*ab) { *ab = audit_log_start(context, GFP_KERNEL, AUDIT_TIME_INJOFFSET); if (!*ab) return; } audit_log_format(*ab, "sec=%lli nsec=%li", (long long)tk->tv_sec, tk->tv_nsec); audit_log_end(*ab); *ab = NULL; } } static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; int i; ab = audit_log_start(context, GFP_KERNEL, context->type); if (!ab) return; switch (context->type) { case AUDIT_SOCKETCALL: { int nargs = context->socketcall.nargs; audit_log_format(ab, "nargs=%d", nargs); for (i = 0; i < nargs; i++) audit_log_format(ab, " a%d=%lx", i, context->socketcall.args[i]); break; } case AUDIT_IPC: audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", from_kuid(&init_user_ns, context->ipc.uid), from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; } if (context->ipc.has_perm) { audit_log_end(ab); ab = audit_log_start(context, GFP_KERNEL, AUDIT_IPC_SET_PERM); if (unlikely(!ab)) return; audit_log_format(ab, "qbytes=%lx ouid=%u ogid=%u mode=%#ho", context->ipc.qbytes, context->ipc.perm_uid, context->ipc.perm_gid, context->ipc.perm_mode); } break; case AUDIT_MQ_OPEN: audit_log_format(ab, "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " "mq_msgsize=%ld mq_curmsgs=%ld", context->mq_open.oflag, context->mq_open.mode, context->mq_open.attr.mq_flags, context->mq_open.attr.mq_maxmsg, context->mq_open.attr.mq_msgsize, context->mq_open.attr.mq_curmsgs); break; case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " "abs_timeout_sec=%lld abs_timeout_nsec=%ld", context->mq_sendrecv.mqdes, context->mq_sendrecv.msg_len, context->mq_sendrecv.msg_prio, (long long) context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); break; case AUDIT_MQ_NOTIFY: audit_log_format(ab, "mqdes=%d sigev_signo=%d", context->mq_notify.mqdes, context->mq_notify.sigev_signo); break; case AUDIT_MQ_GETSETATTR: { struct mq_attr *attr = &context->mq_getsetattr.mqstat; audit_log_format(ab, "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " "mq_curmsgs=%ld ", context->mq_getsetattr.mqdes, attr->mq_flags, attr->mq_maxmsg, attr->mq_msgsize, attr->mq_curmsgs); break; } case AUDIT_CAPSET: audit_log_format(ab, "pid=%d", context->capset.pid); audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); break; case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); break; case AUDIT_OPENAT2: audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx", context->openat2.flags, context->openat2.mode, context->openat2.resolve); break; case AUDIT_EXECVE: audit_log_execve_info(context, &ab); break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); } else audit_log_format(ab, "(null)"); break; case AUDIT_TIME_ADJNTPVAL: case AUDIT_TIME_INJOFFSET: /* this call deviates from the rest, eating the buffer */ audit_log_time(context, &ab); break; } audit_log_end(ab); } static inline int audit_proctitle_rtrim(char *proctitle, int len) { char *end = proctitle + len - 1; while (end > proctitle && !isprint(*end)) end--; /* catch the case where proctitle is only 1 non-print character */ len = end - proctitle + 1; len -= isprint(proctitle[len-1]) == 0; return len; } /* * audit_log_name - produce AUDIT_PATH record from struct audit_names * @context: audit_context for the task * @n: audit_names structure with reportable details * @path: optional path to report instead of audit_names->name * @record_num: record number to report when handling a list of names * @call_panic: optional pointer to int that will be updated if secid fails */ static void audit_log_name(struct audit_context *context, struct audit_names *n, const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) return; audit_log_format(ab, "item=%d", record_num); if (path) audit_log_d_path(ab, " name=", path); else if (n->name) { switch (n->name_len) { case AUDIT_NAME_FULL: /* log the full path */ audit_log_format(ab, " name="); audit_log_untrustedstring(ab, n->name->name); break; case 0: /* name was specified as a relative path and the * directory component is the cwd */ if (context->pwd.dentry && context->pwd.mnt) audit_log_d_path(ab, " name=", &context->pwd); else audit_log_format(ab, " name=(null)"); break; default: /* log the name's directory component */ audit_log_format(ab, " name="); audit_log_n_untrustedstring(ab, n->name->name, n->name_len); } } else audit_log_format(ab, " name=(null)"); if (n->ino != AUDIT_INO_UNSET) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), MINOR(n->dev), n->mode, from_kuid(&init_user_ns, n->uid), from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); if (lsmprop_is_set(&n->oprop) && audit_log_obj_ctx(ab, &n->oprop)) *call_panic = 2; /* log the audit_names record type */ switch (n->type) { case AUDIT_TYPE_NORMAL: audit_log_format(ab, " nametype=NORMAL"); break; case AUDIT_TYPE_PARENT: audit_log_format(ab, " nametype=PARENT"); break; case AUDIT_TYPE_CHILD_DELETE: audit_log_format(ab, " nametype=DELETE"); break; case AUDIT_TYPE_CHILD_CREATE: audit_log_format(ab, " nametype=CREATE"); break; default: audit_log_format(ab, " nametype=UNKNOWN"); break; } audit_log_fcaps(ab, n); audit_log_end(ab); } static void audit_log_proctitle(void) { int res; char *buf; char *msg = "(null)"; int len = strlen(msg); struct audit_context *context = audit_context(); struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ audit_log_format(ab, "proctitle="); /* Not cached */ if (!context->proctitle.value) { buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL); if (!buf) goto out; /* Historically called this from procfs naming */ res = get_cmdline(current, buf, MAX_PROCTITLE_AUDIT_LEN); if (res == 0) { kfree(buf); goto out; } res = audit_proctitle_rtrim(buf, res); if (res == 0) { kfree(buf); goto out; } context->proctitle.value = buf; context->proctitle.len = res; } msg = context->proctitle.value; len = context->proctitle.len; out: audit_log_n_untrustedstring(ab, msg, len); audit_log_end(ab); } /** * audit_log_uring - generate a AUDIT_URINGOP record * @ctx: the audit context */ static void audit_log_uring(struct audit_context *ctx) { struct audit_buffer *ab; const struct cred *cred; ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP); if (!ab) return; cred = current_cred(); audit_log_format(ab, "uring_op=%d", ctx->uring_op); if (ctx->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", str_yes_no(ctx->return_valid == AUDITSC_SUCCESS), ctx->return_code); audit_log_format(ab, " items=%d" " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u" " fsuid=%u egid=%u sgid=%u fsgid=%u", ctx->name_count, task_ppid_nr(current), task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), from_kuid(&init_user_ns, cred->euid), from_kuid(&init_user_ns, cred->suid), from_kuid(&init_user_ns, cred->fsuid), from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid)); audit_log_task_context(ab); audit_log_key(ab, ctx->filterkey); audit_log_end(ab); } static void audit_log_exit(void) { int i, call_panic = 0; struct audit_context *context = audit_context(); struct audit_buffer *ab; struct audit_aux_data *aux; struct audit_names *n; context->personality = current->personality; switch (context->context) { case AUDIT_CTX_SYSCALL: ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) return; audit_log_format(ab, "arch=%x syscall=%d", context->arch, context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); if (context->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", str_yes_no(context->return_valid == AUDITSC_SUCCESS), context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", context->argv[0], context->argv[1], context->argv[2], context->argv[3], context->name_count); audit_log_task_info(ab); audit_log_key(ab, context->filterkey); audit_log_end(ab); break; case AUDIT_CTX_URING: audit_log_uring(context); break; default: BUG(); break; } for (aux = context->aux; aux; aux = aux->next) { ab = audit_log_start(context, GFP_KERNEL, aux->type); if (!ab) continue; /* audit_panic has been called */ switch (aux->type) { case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); audit_log_cap(ab, "fp", &axs->fcap.permitted); audit_log_cap(ab, "fi", &axs->fcap.inheritable); audit_log_format(ab, " fe=%d", axs->fcap.fE); audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); audit_log_cap(ab, "pp", &axs->new_pcap.permitted); audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); audit_log_cap(ab, "pe", &axs->new_pcap.effective); audit_log_cap(ab, "pa", &axs->new_pcap.ambient); audit_log_format(ab, " frootid=%d", from_kuid(&init_user_ns, axs->fcap.rootid)); break; } } audit_log_end(ab); } if (context->type) show_special(context, &call_panic); if (context->fds[0] >= 0) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); if (ab) { audit_log_format(ab, "fd0=%d fd1=%d", context->fds[0], context->fds[1]); audit_log_end(ab); } } if (context->sockaddr_len) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); if (ab) { audit_log_format(ab, "saddr="); audit_log_n_hex(ab, (void *)context->sockaddr, context->sockaddr_len); audit_log_end(ab); } } for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; for (i = 0; i < axs->pid_count; i++) if (audit_log_pid_context(context, axs->target_pid[i], axs->target_auid[i], axs->target_uid[i], axs->target_sessionid[i], &axs->target_ref[i], axs->target_comm[i])) call_panic = 1; } if (context->target_pid && audit_log_pid_context(context, context->target_pid, context->target_auid, context->target_uid, context->target_sessionid, &context->target_ref, context->target_comm)) call_panic = 1; if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } i = 0; list_for_each_entry(n, &context->names_list, list) { if (n->hidden) continue; audit_log_name(context, n, NULL, i++, &call_panic); } if (context->context == AUDIT_CTX_SYSCALL) audit_log_proctitle(); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); if (ab) audit_log_end(ab); if (call_panic) audit_panic("error in audit_log_exit()"); } /** * __audit_free - free a per-task audit context * @tsk: task whose audit context block to free * * Called from copy_process, do_exit, and the io_uring code */ void __audit_free(struct task_struct *tsk) { struct audit_context *context = tsk->audit_context; if (!context) return; /* this may generate CONFIG_CHANGE records */ if (!list_empty(&context->killed_trees)) audit_kill_trees(context); /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a * random task_struct that doesn't have any meaningful data we * need to log via audit_log_exit(). */ if (tsk == current && !context->dummy) { context->return_valid = AUDITSC_INVALID; context->return_code = 0; if (context->context == AUDIT_CTX_SYSCALL) { audit_filter_syscall(tsk, context); audit_filter_inodes(tsk, context); if (context->current_state == AUDIT_STATE_RECORD) audit_log_exit(); } else if (context->context == AUDIT_CTX_URING) { /* TODO: verify this case is real and valid */ audit_filter_uring(tsk, context); audit_filter_inodes(tsk, context); if (context->current_state == AUDIT_STATE_RECORD) audit_log_uring(context); } } audit_set_context(tsk, NULL); audit_free_context(context); } /** * audit_return_fixup - fixup the return codes in the audit_context * @ctx: the audit_context * @success: true/false value to indicate if the operation succeeded or not * @code: operation return code * * We need to fixup the return code in the audit logs if the actual return * codes are later going to be fixed by the arch specific signal handlers. */ static void audit_return_fixup(struct audit_context *ctx, int success, long code) { /* * This is actually a test for: * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) * * but is faster than a bunch of || */ if (unlikely(code <= -ERESTARTSYS) && (code >= -ERESTART_RESTARTBLOCK) && (code != -ENOIOCTLCMD)) ctx->return_code = -EINTR; else ctx->return_code = code; ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE); } /** * __audit_uring_entry - prepare the kernel task's audit context for io_uring * @op: the io_uring opcode * * This is similar to audit_syscall_entry() but is intended for use by io_uring * operations. This function should only ever be called from * audit_uring_entry() as we rely on the audit context checking present in that * function. */ void __audit_uring_entry(u8 op) { struct audit_context *ctx = audit_context(); if (ctx->state == AUDIT_STATE_DISABLED) return; /* * NOTE: It's possible that we can be called from the process' context * before it returns to userspace, and before audit_syscall_exit() * is called. In this case there is not much to do, just record * the io_uring details and return. */ ctx->uring_op = op; if (ctx->context == AUDIT_CTX_SYSCALL) return; ctx->dummy = !audit_n_rules; if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD) ctx->prio = 0; ctx->context = AUDIT_CTX_URING; ctx->current_state = ctx->state; ktime_get_coarse_real_ts64(&ctx->stamp.ctime); } /** * __audit_uring_exit - wrap up the kernel task's audit context after io_uring * @success: true/false value to indicate if the operation succeeded or not * @code: operation return code * * This is similar to audit_syscall_exit() but is intended for use by io_uring * operations. This function should only ever be called from * audit_uring_exit() as we rely on the audit context checking present in that * function. */ void __audit_uring_exit(int success, long code) { struct audit_context *ctx = audit_context(); if (ctx->dummy) { if (ctx->context != AUDIT_CTX_URING) return; goto out; } audit_return_fixup(ctx, success, code); if (ctx->context == AUDIT_CTX_SYSCALL) { /* * NOTE: See the note in __audit_uring_entry() about the case * where we may be called from process context before we * return to userspace via audit_syscall_exit(). In this * case we simply emit a URINGOP record and bail, the * normal syscall exit handling will take care of * everything else. * It is also worth mentioning that when we are called, * the current process creds may differ from the creds * used during the normal syscall processing; keep that * in mind if/when we move the record generation code. */ /* * We need to filter on the syscall info here to decide if we * should emit a URINGOP record. I know it seems odd but this * solves the problem where users have a filter to block *all* * syscall records in the "exit" filter; we want to preserve * the behavior here. */ audit_filter_syscall(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) audit_filter_uring(current, ctx); audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) return; audit_log_uring(ctx); return; } /* this may generate CONFIG_CHANGE records */ if (!list_empty(&ctx->killed_trees)) audit_kill_trees(ctx); /* run through both filters to ensure we set the filterkey properly */ audit_filter_uring(current, ctx); audit_filter_inodes(current, ctx); if (ctx->current_state != AUDIT_STATE_RECORD) goto out; audit_log_exit(); out: audit_reset_context(ctx); } /** * __audit_syscall_entry - fill in an audit record at syscall entry * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 * @a3: additional syscall register 3 * @a4: additional syscall register 4 * * Fill in audit context at syscall entry. This only happens if the * audit context was created when the task was created and the state or * filters demand the audit context be built. If the state from the * per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD, * then the record will be written at syscall exit time (otherwise, it * will only be written if another part of the kernel requests that it * be written). */ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { struct audit_context *context = audit_context(); enum audit_state state; if (!audit_enabled || !context) return; WARN_ON(context->context != AUDIT_CTX_UNUSED); WARN_ON(context->name_count); if (context->context != AUDIT_CTX_UNUSED || context->name_count) { audit_panic("unrecoverable error in audit_syscall_entry()"); return; } state = context->state; if (state == AUDIT_STATE_DISABLED) return; context->dummy = !audit_n_rules; if (!context->dummy && state == AUDIT_STATE_BUILD) { context->prio = 0; if (auditd_test_task(current)) return; } context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; context->argv[2] = a3; context->argv[3] = a4; context->context = AUDIT_CTX_SYSCALL; context->current_state = state; ktime_get_coarse_real_ts64(&context->stamp.ctime); } /** * __audit_syscall_exit - deallocate audit context after a system call * @success: success value of the syscall * @return_code: return value of the syscall * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_STATE_RECORD state from * filtering, or because some other part of the kernel wrote an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ void __audit_syscall_exit(int success, long return_code) { struct audit_context *context = audit_context(); if (!context || context->dummy || context->context != AUDIT_CTX_SYSCALL) goto out; /* this may generate CONFIG_CHANGE records */ if (!list_empty(&context->killed_trees)) audit_kill_trees(context); audit_return_fixup(context, success, return_code); /* run through both filters to ensure we set the filterkey properly */ audit_filter_syscall(current, context); audit_filter_inodes(current, context); if (context->current_state != AUDIT_STATE_RECORD) goto out; audit_log_exit(); out: audit_reset_context(context); } static inline void handle_one(const struct inode *inode) { struct audit_context *context; struct audit_tree_refs *p; struct audit_chunk *chunk; int count; if (likely(!inode->i_fsnotify_marks)) return; context = audit_context(); p = context->trees; count = context->tree_count; rcu_read_lock(); chunk = audit_tree_lookup(inode); rcu_read_unlock(); if (!chunk) return; if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { pr_warn("out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); return; } put_tree_ref(context, chunk); } static void handle_path(const struct dentry *dentry) { struct audit_context *context; struct audit_tree_refs *p; const struct dentry *d, *parent; struct audit_chunk *drop; unsigned long seq; int count; context = audit_context(); p = context->trees; count = context->tree_count; retry: drop = NULL; d = dentry; rcu_read_lock(); seq = read_seqbegin(&rename_lock); for (;;) { struct inode *inode = d_backing_inode(d); if (inode && unlikely(inode->i_fsnotify_marks)) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { if (unlikely(!put_tree_ref(context, chunk))) { drop = chunk; break; } } } parent = d->d_parent; if (parent == d) break; d = parent; } if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */ rcu_read_unlock(); if (!drop) { /* just a race with rename */ unroll_tree_refs(context, p, count); goto retry; } audit_put_chunk(drop); if (grow_tree_refs(context)) { /* OK, got more space */ unroll_tree_refs(context, p, count); goto retry; } /* too bad */ pr_warn("out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; } rcu_read_unlock(); } static struct audit_names *audit_alloc_name(struct audit_context *context, unsigned char type) { struct audit_names *aname; if (context->name_count < AUDIT_NAMES) { aname = &context->preallocated_names[context->name_count]; memset(aname, 0, sizeof(*aname)); } else { aname = kzalloc(sizeof(*aname), GFP_NOFS); if (!aname) return NULL; aname->should_free = true; } aname->ino = AUDIT_INO_UNSET; aname->type = type; list_add_tail(&aname->list, &context->names_list); context->name_count++; if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); return aname; } /** * __audit_reusename - fill out filename with info from existing entry * @uptr: userland ptr to pathname * * Search the audit_names list for the current audit context. If there is an * existing entry with a matching "uptr" then return the filename * associated with that audit_name. If not, return NULL. */ struct filename * __audit_reusename(const __user char *uptr) { struct audit_context *context = audit_context(); struct audit_names *n; list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; if (n->name->uptr == uptr) return refname(n->name); } return NULL; } /** * __audit_getname - add a name to the list * @name: name to add * * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ void __audit_getname(struct filename *name) { struct audit_context *context = audit_context(); struct audit_names *n; if (context->context == AUDIT_CTX_UNUSED) return; n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; refname(name); } static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; if (!dentry) return 0; rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps); if (rc) return rc; name->fcap.permitted = caps.permitted; name->fcap.inheritable = caps.inheritable; name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); name->fcap.rootid = caps.rootid; name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; return 0; } /* Copy inode data into an audit_names. */ static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, struct inode *inode, unsigned int flags) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; name->mode = inode->i_mode; name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; security_inode_getlsmprop(inode, &name->oprop); if (flags & AUDIT_INODE_NOEVAL) { name->fcap_ver = -1; return; } audit_copy_fcaps(name, dentry); } /** * __audit_inode - store the inode and device from a lookup * @name: name being audited * @dentry: dentry being audited * @flags: attributes for this particular entry */ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; int i; if (context->context == AUDIT_CTX_UNUSED) return; rcu_read_lock(); list_for_each_entry_rcu(e, list, list) { for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; if (f->type == AUDIT_FSTYPE && audit_comparator(inode->i_sb->s_magic, f->op, f->val) && e->rule.action == AUDIT_NEVER) { rcu_read_unlock(); return; } } } rcu_read_unlock(); if (!name) goto out_alloc; /* * If we have a pointer to an audit_names entry already, then we can * just use it directly if the type is correct. */ n = name->aname; if (n) { if (parent) { if (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) goto out; } else { if (n->type != AUDIT_TYPE_PARENT) goto out; } } list_for_each_entry_reverse(n, &context->names_list, list) { if (n->ino) { /* valid inode number, use that for the comparison */ if (n->ino != inode->i_ino || n->dev != inode->i_sb->s_dev) continue; } else if (n->name) { /* inode number has not been set, check the name */ if (strcmp(n->name->name, name->name)) continue; } else /* no inode and no name (?!) ... this is odd ... */ continue; /* match the correct record type */ if (parent) { if (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) goto out; } else { if (n->type != AUDIT_TYPE_PARENT) goto out; } } out_alloc: /* unable to find an entry with both a matching name and type */ n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; if (name) { n->name = name; refname(name); } out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; n->type = AUDIT_TYPE_PARENT; if (flags & AUDIT_INODE_HIDDEN) n->hidden = true; } else { n->name_len = AUDIT_NAME_FULL; n->type = AUDIT_TYPE_NORMAL; } handle_path(dentry); audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL); } void __audit_file(const struct file *file) { __audit_inode(NULL, file->f_path.dentry, 0); } /** * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent * @dentry: dentry being audited * @type: AUDIT_TYPE_* value that we're looking for * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. * This call updates the audit context with the child's information. * Syscalls that create a new filesystem object must be hooked after * the object is created. Syscalls that remove a filesystem object * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; int i; if (context->context == AUDIT_CTX_UNUSED) return; rcu_read_lock(); list_for_each_entry_rcu(e, list, list) { for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; if (f->type == AUDIT_FSTYPE && audit_comparator(parent->i_sb->s_magic, f->op, f->val) && e->rule.action == AUDIT_NEVER) { rcu_read_unlock(); return; } } } rcu_read_unlock(); if (inode) handle_one(inode); list_for_each_entry(n, &context->names_list, list) { /* can only match entries that have a name */ if (!n->name) continue; /* look for a parent entry first */ if (!found_parent && (n->type == AUDIT_TYPE_PARENT || n->type == AUDIT_TYPE_UNKNOWN) && (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev && !audit_compare_dname_path(dname, n->name->name, n->name_len))) { n->type = AUDIT_TYPE_PARENT; found_parent = n; if (found_child) break; continue; } /* is there a matching child entry? */ if (!found_child && (n->type == type || n->type == AUDIT_TYPE_UNKNOWN) && (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? found_parent->name_len : AUDIT_NAME_FULL))) { if (n->type == AUDIT_TYPE_UNKNOWN) n->type = type; found_child = n; if (found_parent) break; } } if (!found_parent) { /* create a new, "anonymous" parent record */ n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; audit_copy_inode(n, NULL, parent, 0); } if (!found_child) { found_child = audit_alloc_name(context, type); if (!found_child) return; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; refname(found_child->name); } } if (inode) audit_copy_inode(found_child, dentry, inode, 0); else found_child->ino = AUDIT_INO_UNSET; } EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task * @stamp: timestamp to record * * Also sets the context as auditable. */ int auditsc_get_stamp(struct audit_context *ctx, struct audit_stamp *stamp) { if (ctx->context == AUDIT_CTX_UNUSED) return 0; if (!ctx->stamp.serial) ctx->stamp.serial = audit_serial(); *stamp = ctx->stamp; if (!ctx->prio) { ctx->prio = 1; ctx->current_state = AUDIT_STATE_RECORD; } return 1; } /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag * @mode: mode bits * @attr: queue attributes * */ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { struct audit_context *context = audit_context(); if (attr) memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); else memset(&context->mq_open.attr, 0, sizeof(struct mq_attr)); context->mq_open.oflag = oflag; context->mq_open.mode = mode; context->type = AUDIT_MQ_OPEN; } /** * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive * @mqdes: MQ descriptor * @msg_len: Message length * @msg_prio: Message priority * @abs_timeout: Message timeout in absolute time * */ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { struct audit_context *context = audit_context(); struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) memcpy(p, abs_timeout, sizeof(*p)); else memset(p, 0, sizeof(*p)); context->mq_sendrecv.mqdes = mqdes; context->mq_sendrecv.msg_len = msg_len; context->mq_sendrecv.msg_prio = msg_prio; context->type = AUDIT_MQ_SENDRECV; } /** * __audit_mq_notify - record audit data for a POSIX MQ notify * @mqdes: MQ descriptor * @notification: Notification event * */ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { struct audit_context *context = audit_context(); if (notification) context->mq_notify.sigev_signo = notification->sigev_signo; else context->mq_notify.sigev_signo = 0; context->mq_notify.mqdes = mqdes; context->type = AUDIT_MQ_NOTIFY; } /** * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute * @mqdes: MQ descriptor * @mqstat: MQ flags * */ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { struct audit_context *context = audit_context(); context->mq_getsetattr.mqdes = mqdes; context->mq_getsetattr.mqstat = *mqstat; context->type = AUDIT_MQ_GETSETATTR; } /** * __audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions * */ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { struct audit_context *context = audit_context(); context->ipc.uid = ipcp->uid; context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; context->ipc.has_perm = 0; security_ipc_getlsmprop(ipcp, &context->ipc.oprop); context->type = AUDIT_IPC; } /** * __audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) * * Called only after audit_ipc_obj(). */ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { struct audit_context *context = audit_context(); context->ipc.qbytes = qbytes; context->ipc.perm_uid = uid; context->ipc.perm_gid = gid; context->ipc.perm_mode = mode; context->ipc.has_perm = 1; } void __audit_bprm(struct linux_binprm *bprm) { struct audit_context *context = audit_context(); context->type = AUDIT_EXECVE; context->execve.argc = bprm->argc; } /** * __audit_socketcall - record audit data for sys_socketcall * @nargs: number of args, which should not be more than AUDITSC_ARGS. * @args: args array * */ int __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = audit_context(); if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) return -EINVAL; context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); return 0; } /** * __audit_fd_pair - record audit data for pipe and socketpair * @fd1: the first file descriptor * @fd2: the second file descriptor * */ void __audit_fd_pair(int fd1, int fd2) { struct audit_context *context = audit_context(); context->fds[0] = fd1; context->fds[1] = fd2; } /** * __audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto * @len: data length in user space * @a: data address in kernel space * * Returns 0 for success or NULL context or < 0 on error. */ int __audit_sockaddr(int len, void *a) { struct audit_context *context = audit_context(); if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) return -ENOMEM; context->sockaddr = p; } context->sockaddr_len = len; memcpy(context->sockaddr, a, len); return 0; } void __audit_ptrace(struct task_struct *t) { struct audit_context *context = audit_context(); context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); strscpy(context->target_comm, t->comm); security_task_getlsmprop_obj(t, &context->target_ref); } /** * audit_signal_info_syscall - record signal info for syscalls * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ int audit_signal_info_syscall(struct task_struct *t) { struct audit_aux_data_pids *axp; struct audit_context *ctx = audit_context(); kuid_t t_uid = task_uid(t); if (!audit_signals || audit_dummy_context()) return 0; /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { ctx->target_pid = task_tgid_nr(t); ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); strscpy(ctx->target_comm, t->comm); security_task_getlsmprop_obj(t, &ctx->target_ref); return 0; } axp = (void *)ctx->aux_pids; if (!axp || axp->pid_count == AUDIT_AUX_PIDS) { axp = kzalloc(sizeof(*axp), GFP_ATOMIC); if (!axp) return -ENOMEM; axp->d.type = AUDIT_OBJ_PID; axp->d.next = ctx->aux_pids; ctx->aux_pids = (void *)axp; } BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); axp->target_pid[axp->pid_count] = task_tgid_nr(t); axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]); strscpy(axp->target_comm[axp->pid_count], t->comm); axp->pid_count++; return 0; } /** * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps * @bprm: pointer to the bprm being processed * @new: the proposed new credentials * @old: the old credentials * * Simply check if the proc already has the caps given by the file and if not * store the priv escalation info for later auditing at the end of the syscall * * -Eric */ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { struct audit_aux_data_bprm_fcaps *ax; struct audit_context *context = audit_context(); struct cpu_vfs_cap_data vcaps; ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; ax->d.type = AUDIT_BPRM_FCAPS; ax->d.next = context->aux; context->aux = (void *)ax; get_vfs_caps_from_disk(&nop_mnt_idmap, bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); ax->fcap.rootid = vcaps.rootid; ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = old->cap_permitted; ax->old_pcap.inheritable = old->cap_inheritable; ax->old_pcap.effective = old->cap_effective; ax->old_pcap.ambient = old->cap_ambient; ax->new_pcap.permitted = new->cap_permitted; ax->new_pcap.inheritable = new->cap_inheritable; ax->new_pcap.effective = new->cap_effective; ax->new_pcap.ambient = new->cap_ambient; return 0; } /** * __audit_log_capset - store information about the arguments to the capset syscall * @new: the new credentials * @old: the old (current) credentials * * Record the arguments userspace sent to sys_capset for later printing by the * audit system if applicable */ void __audit_log_capset(const struct cred *new, const struct cred *old) { struct audit_context *context = audit_context(); context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; } void __audit_mmap_fd(int fd, int flags) { struct audit_context *context = audit_context(); context->mmap.fd = fd; context->mmap.flags = flags; context->type = AUDIT_MMAP; } void __audit_openat2_how(struct open_how *how) { struct audit_context *context = audit_context(); context->openat2.flags = how->flags; context->openat2.mode = how->mode; context->openat2.resolve = how->resolve; context->type = AUDIT_OPENAT2; } void __audit_log_kern_module(const char *name) { struct audit_context *context = audit_context(); context->module.name = kstrdup(name, GFP_KERNEL); if (!context->module.name) audit_log_lost("out of memory in __audit_log_kern_module"); context->type = AUDIT_KERN_MODULE; } void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */ switch (friar->hdr.type) { case FAN_RESPONSE_INFO_NONE: audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2", response, FAN_RESPONSE_INFO_NONE); break; case FAN_RESPONSE_INFO_AUDIT_RULE: audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u", response, friar->hdr.type, friar->rule_number, friar->subj_trust, friar->obj_trust); } } void __audit_tk_injoffset(struct timespec64 offset) { struct audit_context *context = audit_context(); /* only set type if not already set by NTP */ if (!context->type) context->type = AUDIT_TIME_INJOFFSET; memcpy(&context->time.tk_injoffset, &offset, sizeof(offset)); } void __audit_ntp_log(const struct audit_ntp_data *ad) { struct audit_context *context = audit_context(); int type; for (type = 0; type < AUDIT_NTP_NVALS; type++) if (ad->vals[type].newval != ad->vals[type].oldval) { /* unconditionally set type, overwriting TK */ context->type = AUDIT_TIME_ADJNTPVAL; memcpy(&context->time.ntp_data, ad, sizeof(*ad)); break; } } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { struct audit_buffer *ab; char comm[sizeof(current->comm)]; ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG); if (!ab) return; audit_log_format(ab, "table=%s family=%u entries=%u op=%s", name, af, nentries, audit_nfcfgs[op].s); audit_log_format(ab, " pid=%u", task_tgid_nr(current)); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_end(ab); } EXPORT_SYMBOL_GPL(__audit_log_nfcfg); static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; kgid_t gid; unsigned int sessionid; char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); current_uid_gid(&uid, &gid); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_d_path_exe(ab, current->mm); } /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value * * If a process ends with a core dump, something fishy is going on and we * should record the event for investigation. */ void audit_core_dumps(long signr) { struct audit_buffer *ab; if (!audit_enabled) return; if (signr == SIGQUIT) /* don't care for those */ return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_ABEND); if (unlikely(!ab)) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld res=1", signr); audit_log_end(ab); } /** * audit_seccomp - record information about a seccomp action * @syscall: syscall number * @signr: signal value * @code: the seccomp action * * Record the information associated with a seccomp action. Event filtering for * seccomp actions that are not to be logged is done in seccomp_log(). * Therefore, this function forces auditing independent of the audit_enabled * and dummy context state because seccomp actions should be logged even when * audit is not in use. */ void audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_SECCOMP); if (unlikely(!ab)) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { struct audit_buffer *ab; if (!audit_enabled) return; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_format(ab, "op=seccomp-logging actions=%s old-actions=%s res=%d", names, old_names, res); audit_log_end(ab); } struct list_head *audit_killed_trees(void) { struct audit_context *ctx = audit_context(); if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED)) return NULL; return &ctx->killed_trees; }
4 12 12 1 1 1 4 1 1 2 2 2 2 2 2 2 1 2 2 2 2 979 11 2204 2205 2209 1 6 6 13 1 1 1 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 /* SPDX-License-Identifier: GPL-2.0 */ /* * Filesystem access notification for Linux * * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #ifndef __LINUX_FSNOTIFY_BACKEND_H #define __LINUX_FSNOTIFY_BACKEND_H #ifdef __KERNEL__ #include <linux/idr.h> /* inotify uses this */ #include <linux/fs.h> /* struct inode */ #include <linux/list.h> #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/mempool.h> #include <linux/sched/mm.h> /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily * convert between them. dnotify only needs conversion at watch creation * so no perf loss there. fanotify isn't defined yet, so it can use the * wholes if it needs more events. */ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ #define FS_CLOSE_WRITE 0x00000008 /* Writable file was closed */ #define FS_CLOSE_NOWRITE 0x00000010 /* Unwritable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ #define FS_CREATE 0x00000100 /* Subfile was created */ #define FS_DELETE 0x00000200 /* Subfile was deleted */ #define FS_DELETE_SELF 0x00000400 /* Self was deleted */ #define FS_MOVE_SELF 0x00000800 /* Self was moved */ #define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ #define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */ /* * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify * which does not support FS_ERROR. */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ #define FS_MNT_ATTACH 0x01000000 /* Mount was attached */ #define FS_MNT_DETACH 0x02000000 /* Mount was detached */ #define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH) /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. * Set on inode/sb/mount marks that care about parent/name info. */ #define FS_EVENT_ON_CHILD 0x08000000 #define FS_RENAME 0x10000000 /* File was renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) /* * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) /* Mount namespace events */ #define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH) /* Content events can be used to inspect file content */ #define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ FS_ACCESS_PERM) /* Pre-content events can be used to fill file content */ #define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) #define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory. */ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children. */ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FSNOTIFY_MNT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \ FS_ERROR) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * Either @inode or @dir must be non-NULL. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { struct list_head list; }; /* * fsnotify group priorities. * Events are sent in order from highest priority to lowest priority. */ enum fsnotify_group_prio { FSNOTIFY_PRIO_NORMAL = 0, /* normal notifiers, no permissions */ FSNOTIFY_PRIO_CONTENT, /* fanotify permission events */ FSNOTIFY_PRIO_PRE_CONTENT, /* fanotify pre-content events */ __FSNOTIFY_PRIO_NUM }; /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. */ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ enum fsnotify_group_prio priority; /* priority for sending events */ bool shutdown; /* group is being shut down, don't queue more events */ #define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */ #define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */ int flags; unsigned int owner_flags; /* stored flags of mark_mutex owner */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ struct user_namespace *user_ns; /* user ns where group was created */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* Hash table of events for merge */ struct hlist_head *merge_hash; /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; /* chained on perm_group_list */ struct list_head perm_grp_list; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* * These helpers are used to prevent deadlock when reclaiming inodes with * evictable marks of the same group that is allocating a new mark. */ static inline void fsnotify_group_lock(struct fsnotify_group *group) { mutex_lock(&group->mark_mutex); group->owner_flags = memalloc_nofs_save(); } static inline void fsnotify_group_unlock(struct fsnotify_group *group) { memalloc_nofs_restore(group->owner_flags); mutex_unlock(&group->mark_mutex); } static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) { WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, FSNOTIFY_EVENT_MNT, FSNOTIFY_EVENT_ERROR, }; struct fs_error_report { int error; struct inode *inode; struct super_block *sb; }; struct file_range { const struct path *path; loff_t pos; size_t count; }; static inline const struct path *file_range_path(const struct file_range *range) { return range->path; } struct fsnotify_mnt { const struct mnt_namespace *ns; u64 mnt_id; }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_DENTRY: return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); case FSNOTIFY_EVENT_FILE_RANGE: return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: return NULL; } } static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_DENTRY: /* Non const is needed for dget() */ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data)->dentry; default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data); default: return NULL; } } static inline struct super_block *fsnotify_data_sb(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return ((struct inode *)data)->i_sb; case FSNOTIFY_EVENT_DENTRY: return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; case FSNOTIFY_EVENT_FILE_RANGE: return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: return NULL; } } static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_MNT: return data; default: return NULL; } } static inline u64 fsnotify_data_mnt_id(const void *data, int data_type) { const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); return mnt_data ? mnt_data->mnt_id : 0; } static inline struct fs_error_report *fsnotify_data_error_report( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_ERROR: return (struct fs_error_report *) data; default: return NULL; } } static inline const struct file_range *fsnotify_data_file_range( const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_FILE_RANGE: return (struct file_range *)data; default: return NULL; } } /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not * the other way around, because an event can match different watched objects * of the same object type. * For example, both parent and child are watching an object of type inode. */ enum fsnotify_iter_type { FSNOTIFY_ITER_TYPE_INODE, FSNOTIFY_ITER_TYPE_VFSMOUNT, FSNOTIFY_ITER_TYPE_SB, FSNOTIFY_ITER_TYPE_PARENT, FSNOTIFY_ITER_TYPE_INODE2, FSNOTIFY_ITER_TYPE_MNTNS, FSNOTIFY_ITER_TYPE_COUNT }; /* The type of object that a mark is attached to */ enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_ANY = -1, FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_MNTNS, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; static inline bool fsnotify_valid_obj_type(unsigned int obj_type) { return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT]; struct fsnotify_group *current_group; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { return (iter_info->report_mask & (1U << iter_type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int iter_type) { iter_info->report_mask |= (1U << iter_type); } static inline struct fsnotify_mark *fsnotify_iter_mark( struct fsnotify_iter_info *iter_info, int iter_type) { if (fsnotify_iter_should_report_type(iter_info, iter_type)) return iter_info->marks[iter_type]; return NULL; } static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type, struct fsnotify_mark **markp) { while (type < FSNOTIFY_ITER_TYPE_COUNT) { *markp = fsnotify_iter_mark(iter, type); if (*markp) break; type++; } return type; } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_iter_type(type) \ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++) #define fsnotify_foreach_iter_mark_type(iter, mark, type) \ for (type = 0; \ type = fsnotify_iter_step(iter, type, &mark), \ type < FSNOTIFY_ITER_TYPE_COUNT; \ type++) /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned char type; /* Type of object [lock] */ unsigned char prio; /* Highest priority group */ #define FSNOTIFY_CONN_FLAG_IS_WATCHED 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ union { /* Object pointer [lock] */ void *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; /* * Container for per-sb fsnotify state (sb marks and more). * Attached lazily on first marked object on the sb and freed when killing sb. */ struct fsnotify_sb_info { struct fsnotify_mark_connector __rcu *sb_marks; /* * Number of inode/mount/sb objects that are being watched in this sb. * Note that inodes objects are currently double-accounted. * * The value in watched_objects[prio] is the number of objects that are * watched by groups of priority >= prio, so watched_objects[0] is the * total number of watched objects in this sb. */ atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM]; }; static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb) { #ifdef CONFIG_FSNOTIFY return READ_ONCE(sb->s_fsnotify_info); #else return NULL; #endif } static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb) { return &fsnotify_sb_info(sb)->watched_objects[0]; } /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types and flags to ignore [mark->lock, group->mark_mutex] */ __u32 ignore_mask; /* General fsnotify mark flags */ #define FSNOTIFY_MARK_FLAG_ALIVE 0x0001 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002 /* inotify mark flags */ #define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010 #define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020 /* fanotify mark flags */ #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 #define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns); extern void fsnotify_sb_free(struct super_block *sb); extern u32 fsnotify_get_cookie(void); extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? */ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask); /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(parent_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return parent_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happens to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following fsnotify_set_children_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group( const struct fsnotify_ops *ops, int flags); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_insert_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *), void (*insert)(struct fsnotify_group *, struct fsnotify_event *)); static inline int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct fsnotify_group *, struct fsnotify_event *)) { return fsnotify_insert_event(group, event, merge, NULL); } /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } static inline bool fsnotify_is_overflow_event(u32 mask) { return mask & FS_Q_OVERFLOW; } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) { assert_spin_locked(&group->notification_lock); return list_empty(&group->notification_list); } extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the marks attached to inodes */ /* * Canonical "ignore mask" including event flags. * * Note the subtle semantic difference from the legacy ->ignored_mask. * ->ignored_mask traditionally only meant which events should be ignored, * while ->ignore_mask also includes flags regarding the type of objects on * which events should be ignored. */ static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark) { __u32 ignore_mask = mark->ignore_mask; /* The event flags in ignore mask take effect */ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return ignore_mask; /* * Legacy behavior: * - Always ignore events on dir * - Ignore events on child if parent is watching children */ ignore_mask |= FS_ISDIR; ignore_mask &= ~FS_EVENT_ON_CHILD; ignore_mask |= mark->mask & FS_EVENT_ON_CHILD; return ignore_mask; } /* Legacy ignored_mask - only event types to ignore */ static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark) { return mark->ignore_mask & ALL_FSNOTIFY_EVENTS; } /* * Check if mask (or ignore mask) should be applied depending if victim is a * directory and whether it is reported to a watching parent. */ static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir, int iter_type) { /* Should mask be applied to a directory? */ if (is_dir && !(mask & FS_ISDIR)) return false; /* Should mask be applied to a child? */ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT && !(mask & FS_EVENT_ON_CHILD)) return false; return true; } /* * Effective ignore mask taking into account if event victim is a * directory and whether it is reported to a watching parent. */ static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark, bool is_dir, int iter_type) { __u32 ignore_mask = fsnotify_ignored_events(mark); if (!ignore_mask) return 0; /* For non-dir and non-child, no need to consult the event flags */ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT) return ignore_mask; ignore_mask = fsnotify_ignore_mask(mark); if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type)) return 0; return ignore_mask & ALL_FSNOTIFY_EVENTS; } /* Get mask for calculating object interest taking ignore mask into account */ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark) { __u32 mask = mark->mask; if (!fsnotify_ignored_events(mark)) return mask; /* Interest in FS_MODIFY may be needed for clearing ignore mask */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mask |= FS_MODIFY; /* * If mark is interested in ignoring events on children, the object must * show interest in those events for fsnotify_parent() to notice it. */ return mask | mark->ignore_mask; } /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type, struct fsnotify_group *group); /* attach the mark to the object */ int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj, unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE, add_flags); } static inline struct fsnotify_mark *fsnotify_find_inode_mark( struct inode *inode, struct fsnotify_group *group) { return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* Clear all of the marks of a group attached to a given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count); #else static inline int fsnotify_pre_content(const struct path *path, const loff_t *ppos, size_t count) { return 0; } static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void fsnotify_sb_delete(struct super_block *sb) {} static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns) {} static inline void fsnotify_sb_free(struct super_block *sb) {} static inline void fsnotify_update_flags(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) { return 0; } static inline void fsnotify_unmount_inodes(struct super_block *sb) {} static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) {} #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ #endif /* __LINUX_FSNOTIFY_BACKEND_H */
32 1 19 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IVERSION_H #define _LINUX_IVERSION_H #include <linux/fs.h> /* * The inode->i_version field: * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must * appear larger to observers if there was an explicit change to the inode's * data or metadata since it was last queried. * * An explicit change is one that would ordinarily result in a change to the * inode status change time (aka ctime). i_version must appear to change, even * if the ctime does not (since the whole point is to avoid missing updates due * to timestamp granularity). If POSIX or other relevant spec mandates that the * ctime must change due to an operation, then the i_version counter must be * incremented as well. * * Making the i_version update completely atomic with the operation itself would * be prohibitively expensive. Traditionally the kernel has updated the times on * directories after an operation that changes its contents. For regular files, * the ctime is usually updated before the data is copied into the cache for a * write. This means that there is a window of time when an observer can * associate a new timestamp with old file contents. Since the purpose of the * i_version is to allow for better cache coherency, the i_version must always * be updated after the results of the operation are visible. Updating it before * and after a change is also permitted. (Note that no filesystems currently do * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the * inode. If it's different then something has changed. Observers cannot infer * anything about the nature or magnitude of the changes from the value, only * that the inode has changed in some fashion. * * Not all filesystems properly implement the i_version counter. Subsystems that * want to use i_version field on an inode should first check whether the * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro). * * Those that set SB_I_VERSION will automatically have their i_version counter * incremented on writes to normal files. If the SB_I_VERSION is not set, then * the VFS will not touch it on writes, and the filesystem can use it how it * wishes. Note that the filesystem is always responsible for updating the * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.). * We consider these sorts of filesystems to have a kernel-managed i_version. * * It may be impractical for filesystems to keep i_version updates atomic with * respect to the changes that cause them. They should, however, guarantee * that i_version updates are never visible before the changes that caused * them. Also, i_version updates should never be delayed longer than it takes * the original change to reach disk. * * This implementation uses the low bit in the i_version field as a flag to * track when the value has been queried. If it has not been queried since it * was last incremented, we can skip the increment in most cases. * * In the event that we're updating the ctime, we will usually go ahead and * bump the i_version anyway. Since that has to go to stable storage in some * fashion, we might as well increment it as well. * * With this implementation, the value should always appear to observers to * increase over time if the file has changed. It's recommended to use * inode_eq_iversion() helper to compare values. * * Note that some filesystems (e.g. NFS and AFS) just use the field to store * a server-provided value (for the most part). For that reason, those * filesystems do not set SB_I_VERSION. These filesystems are considered to * have a self-managed i_version. * * Persistently storing the i_version * ---------------------------------- * Queries of the i_version field are not gated on them hitting the backing * store. It's always possible that the host could crash after allowing * a query of the value but before it has made it to disk. * * To mitigate this problem, filesystems should always use * inode_set_iversion_queried when loading an existing inode from disk. This * ensures that the next attempted inode increment will result in the value * changing. * * Storing the value to disk therefore does not count as a query, so those * filesystems should use inode_peek_iversion to grab the value to be stored. * There is no need to flag the value as having been queried in that case. */ /* * We borrow the lowest bit in the i_version to use as a flag to tell whether * it has been queried since we last incremented it. If it has, then we must * increment it on the next change. After that, we can clear the flag and * avoid incrementing it again until it has again been queried. */ #define I_VERSION_QUERIED_SHIFT (1) #define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1)) #define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT) /** * inode_set_iversion_raw - set i_version to the specified raw value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for use by * filesystems that self-manage the i_version. * * For example, the NFS client stores its NFSv4 change attribute in this way, * and the AFS client stores the data_version from the server here. */ static inline void inode_set_iversion_raw(struct inode *inode, u64 val) { atomic64_set(&inode->i_version, val); } /** * inode_peek_iversion_raw - grab a "raw" iversion value * @inode: inode from which i_version should be read * * Grab a "raw" inode->i_version value and return it. The i_version is not * flagged or converted in any way. This is mostly used to access a self-managed * i_version. * * With those filesystems, we want to treat the i_version as an entirely * opaque value. */ static inline u64 inode_peek_iversion_raw(const struct inode *inode) { return atomic64_read(&inode->i_version); } /** * inode_set_max_iversion_raw - update i_version new value is larger * @inode: inode to set * @val: new i_version to set * * Some self-managed filesystems (e.g Ceph) will only update the i_version * value if the new value is larger than the one we already have. */ static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { u64 cur = inode_peek_iversion_raw(inode); do { if (cur > val) break; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** * inode_set_iversion - set i_version to a particular value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for filesystems with * a kernel-managed i_version, for initializing a newly-created inode from * scratch. * * In this case, we do not set the QUERIED flag since we know that this value * has never been queried. */ static inline void inode_set_iversion(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT); } /** * inode_set_iversion_queried - set i_version to a particular value as quereied * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val, and flag it for increment on the next * change. * * Filesystems that persistently store the i_version on disk should use this * when loading an existing inode from disk. * * When loading in an i_version value from a backing store, we can't be certain * that it wasn't previously viewed before being stored. Thus, we must assume * that it was, to ensure that we don't end up handing out the same value for * different versions of the same inode. */ static inline void inode_set_iversion_queried(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) | I_VERSION_QUERIED); } bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version * @inode: inode that needs to be updated * * Forcbily increment the i_version field. This always results in a change to * the observable value. */ static inline void inode_inc_iversion(struct inode *inode) { inode_maybe_inc_iversion(inode, true); } /** * inode_iversion_need_inc - is the i_version in need of being incremented? * @inode: inode to check * * Returns whether the inode->i_version counter needs incrementing on the next * change. Just fetch the value and check the QUERIED flag. */ static inline bool inode_iversion_need_inc(struct inode *inode) { return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED; } /** * inode_inc_iversion_raw - forcibly increment raw i_version * @inode: inode that needs to be updated * * Forcbily increment the raw i_version field. This always results in a change * to the raw value. * * NFS will use the i_version field to store the value from the server. It * mostly treats it as opaque, but in the case where it holds a write * delegation, it must increment the value itself. This function does that. */ static inline void inode_inc_iversion_raw(struct inode *inode) { atomic64_inc(&inode->i_version); } /** * inode_peek_iversion - read i_version without flagging it to be incremented * @inode: inode from which i_version should be read * * Read the inode i_version counter for an inode without registering it as a * query. * * This is typically used by local filesystems that need to store an i_version * on disk. In that situation, it's not necessary to flag it as having been * viewed, as the result won't be used to gauge changes from that point. */ static inline u64 inode_peek_iversion(const struct inode *inode) { return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: */ static inline u64 time_to_chattr(const struct timespec64 *t) { u64 chattr = t->tv_sec; chattr <<= 32; chattr += t->tv_nsec; return chattr; } u64 inode_query_iversion(struct inode *inode); /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare the current raw i_version counter with a previous one. Returns true * if they are the same or false if they are different. */ static inline bool inode_eq_iversion_raw(const struct inode *inode, u64 old) { return inode_peek_iversion_raw(inode) == old; } /** * inode_eq_iversion - check whether the i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare an i_version counter with a previous one. Returns true if they are * the same, and false if they are different. * * Note that we don't need to set the QUERIED flag in this case, as the value * in the inode is not being recorded for later use. */ static inline bool inode_eq_iversion(const struct inode *inode, u64 old) { return inode_peek_iversion(inode) == old; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 /* SPDX-License-Identifier: GPL-2.0-only */ /* * fwnode.h - Firmware device node object handle type definition. * * This header file provides low-level data types and definitions for firmware * and device property providers. The respective API header files supplied by * them should contain all of the requisite data types and definitions for end * users, so including it directly should not be necessary. * * Copyright (C) 2015, Intel Corporation * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> */ #ifndef _LINUX_FWNODE_H_ #define _LINUX_FWNODE_H_ #include <linux/bits.h> #include <linux/err.h> #include <linux/list.h> #include <linux/types.h> enum dev_dma_attr { DEV_DMA_NOT_SUPPORTED, DEV_DMA_NON_COHERENT, DEV_DMA_COHERENT, }; struct fwnode_operations; struct device; /* * fwnode flags * * LINKS_ADDED: The fwnode has already be parsed to add fwnode links. * NOT_DEVICE: The fwnode will never be populated as a struct device. * INITIALIZED: The hardware corresponding to fwnode has been initialized. * NEEDS_CHILD_BOUND_ON_ADD: For this fwnode/device to probe successfully, its * driver needs its child devices to be bound with * their respective drivers as soon as they are * added. * BEST_EFFORT: The fwnode/device needs to probe early and might be missing some * suppliers. Only enforce ordering with suppliers that have * drivers. */ #define FWNODE_FLAG_LINKS_ADDED BIT(0) #define FWNODE_FLAG_NOT_DEVICE BIT(1) #define FWNODE_FLAG_INITIALIZED BIT(2) #define FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD BIT(3) #define FWNODE_FLAG_BEST_EFFORT BIT(4) #define FWNODE_FLAG_VISITED BIT(5) struct fwnode_handle { struct fwnode_handle *secondary; const struct fwnode_operations *ops; /* The below is used solely by device links, don't use otherwise */ struct device *dev; struct list_head suppliers; struct list_head consumers; u8 flags; }; /* * fwnode link flags * * CYCLE: The fwnode link is part of a cycle. Don't defer probe. * IGNORE: Completely ignore this link, even during cycle detection. */ #define FWLINK_FLAG_CYCLE BIT(0) #define FWLINK_FLAG_IGNORE BIT(1) struct fwnode_link { struct fwnode_handle *supplier; struct list_head s_hook; struct fwnode_handle *consumer; struct list_head c_hook; u8 flags; }; /** * struct fwnode_endpoint - Fwnode graph endpoint * @port: Port number * @id: Endpoint id * @local_fwnode: reference to the related fwnode */ struct fwnode_endpoint { unsigned int port; unsigned int id; const struct fwnode_handle *local_fwnode; }; /* * ports and endpoints defined as software_nodes should all follow a common * naming scheme; use these macros to ensure commonality. */ #define SWNODE_GRAPH_PORT_NAME_FMT "port@%u" #define SWNODE_GRAPH_ENDPOINT_NAME_FMT "endpoint@%u" #define NR_FWNODE_REFERENCE_ARGS 16 /** * struct fwnode_reference_args - Fwnode reference with additional arguments * @fwnode:- A reference to the base fwnode * @nargs: Number of elements in @args array * @args: Integer arguments on the fwnode */ struct fwnode_reference_args { struct fwnode_handle *fwnode; unsigned int nargs; u64 args[NR_FWNODE_REFERENCE_ARGS]; }; /** * struct fwnode_operations - Operations for fwnode interface * @get: Get a reference to an fwnode. * @put: Put a reference to an fwnode. * @device_is_available: Return true if the device is available. * @device_get_match_data: Return the device driver match data. * @property_present: Return true if a property is present. * @property_read_bool: Return a boolean property value. * @property_read_int_array: Read an array of integer properties. Return zero on * success, a negative error code otherwise. * @property_read_string_array: Read an array of string properties. Return zero * on success, a negative error code otherwise. * @get_name: Return the name of an fwnode. * @get_name_prefix: Get a prefix for a node (for printing purposes). * @get_parent: Return the parent of an fwnode. * @get_next_child_node: Return the next child node in an iteration. * @get_named_child_node: Return a child node with a given name. * @get_reference_args: Return a reference pointed to by a property, with args * @graph_get_next_endpoint: Return an endpoint node in an iteration. * @graph_get_remote_endpoint: Return the remote endpoint node of a local * endpoint node. * @graph_get_port_parent: Return the parent node of a port node. * @graph_parse_endpoint: Parse endpoint for port and endpoint id. * @add_links: Create fwnode links to all the suppliers of the fwnode. Return * zero on success, a negative error code otherwise. */ struct fwnode_operations { struct fwnode_handle *(*get)(struct fwnode_handle *fwnode); void (*put)(struct fwnode_handle *fwnode); bool (*device_is_available)(const struct fwnode_handle *fwnode); const void *(*device_get_match_data)(const struct fwnode_handle *fwnode, const struct device *dev); bool (*device_dma_supported)(const struct fwnode_handle *fwnode); enum dev_dma_attr (*device_get_dma_attr)(const struct fwnode_handle *fwnode); bool (*property_present)(const struct fwnode_handle *fwnode, const char *propname); bool (*property_read_bool)(const struct fwnode_handle *fwnode, const char *propname); int (*property_read_int_array)(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval); int (*property_read_string_array)(const struct fwnode_handle *fwnode_handle, const char *propname, const char **val, size_t nval); const char *(*get_name)(const struct fwnode_handle *fwnode); const char *(*get_name_prefix)(const struct fwnode_handle *fwnode); struct fwnode_handle *(*get_parent)(const struct fwnode_handle *fwnode); struct fwnode_handle * (*get_next_child_node)(const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle * (*get_named_child_node)(const struct fwnode_handle *fwnode, const char *name); int (*get_reference_args)(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args); struct fwnode_handle * (*graph_get_next_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_handle *prev); struct fwnode_handle * (*graph_get_remote_endpoint)(const struct fwnode_handle *fwnode); struct fwnode_handle * (*graph_get_port_parent)(struct fwnode_handle *fwnode); int (*graph_parse_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); void __iomem *(*iomap)(struct fwnode_handle *fwnode, int index); int (*irq_get)(const struct fwnode_handle *fwnode, unsigned int index); int (*add_links)(struct fwnode_handle *fwnode); }; #define fwnode_has_op(fwnode, op) \ (!IS_ERR_OR_NULL(fwnode) && (fwnode)->ops && (fwnode)->ops->op) #define fwnode_call_int_op(fwnode, op, ...) \ (fwnode_has_op(fwnode, op) ? \ (fwnode)->ops->op(fwnode, ## __VA_ARGS__) : (IS_ERR_OR_NULL(fwnode) ? -EINVAL : -ENXIO)) #define fwnode_call_bool_op(fwnode, op, ...) \ (fwnode_has_op(fwnode, op) ? \ (fwnode)->ops->op(fwnode, ## __VA_ARGS__) : false) #define fwnode_call_ptr_op(fwnode, op, ...) \ (fwnode_has_op(fwnode, op) ? \ (fwnode)->ops->op(fwnode, ## __VA_ARGS__) : NULL) #define fwnode_call_void_op(fwnode, op, ...) \ do { \ if (fwnode_has_op(fwnode, op)) \ (fwnode)->ops->op(fwnode, ## __VA_ARGS__); \ } while (false) static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) { fwnode->ops = ops; INIT_LIST_HEAD(&fwnode->consumers); INIT_LIST_HEAD(&fwnode->suppliers); } static inline void fwnode_dev_initialized(struct fwnode_handle *fwnode, bool initialized) { if (IS_ERR_OR_NULL(fwnode)) return; if (initialized) fwnode->flags |= FWNODE_FLAG_INITIALIZED; else fwnode->flags &= ~FWNODE_FLAG_INITIALIZED; } int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup, u8 flags); void fwnode_links_purge(struct fwnode_handle *fwnode); void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode); bool fw_devlink_is_strict(void); #endif
3 3 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 3 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 4 4 3 2 2 2 4 2 2 2 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <trace/events/devlink.h> #include "devl_internal.h" struct devlink_stats { u64_stats_t rx_bytes; u64_stats_t rx_packets; struct u64_stats_sync syncp; }; /** * struct devlink_trap_policer_item - Packet trap policer attributes. * @policer: Immutable packet trap policer attributes. * @rate: Rate in packets / sec. * @burst: Burst size in packets. * @list: trap_policer_list member. * * Describes packet trap policer attributes. Created by devlink during trap * policer registration. */ struct devlink_trap_policer_item { const struct devlink_trap_policer *policer; u64 rate; u64 burst; struct list_head list; }; /** * struct devlink_trap_group_item - Packet trap group attributes. * @group: Immutable packet trap group attributes. * @policer_item: Associated policer item. Can be NULL. * @list: trap_group_list member. * @stats: Trap group statistics. * * Describes packet trap group attributes. Created by devlink during trap * group registration. */ struct devlink_trap_group_item { const struct devlink_trap_group *group; struct devlink_trap_policer_item *policer_item; struct list_head list; struct devlink_stats __percpu *stats; }; /** * struct devlink_trap_item - Packet trap attributes. * @trap: Immutable packet trap attributes. * @group_item: Associated group item. * @list: trap_list member. * @action: Trap action. * @stats: Trap statistics. * @priv: Driver private information. * * Describes both mutable and immutable packet trap attributes. Created by * devlink during trap registration and used for all trap related operations. */ struct devlink_trap_item { const struct devlink_trap *trap; struct devlink_trap_group_item *group_item; struct list_head list; enum devlink_trap_action action; struct devlink_stats __percpu *stats; void *priv; }; static struct devlink_trap_policer_item * devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) { struct devlink_trap_policer_item *policer_item; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { if (policer_item->policer->id == id) return policer_item; } return NULL; } static struct devlink_trap_item * devlink_trap_item_lookup(struct devlink *devlink, const char *name) { struct devlink_trap_item *trap_item; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (!strcmp(trap_item->trap->name, name)) return trap_item; } return NULL; } static struct devlink_trap_item * devlink_trap_item_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr *attr; if (!info->attrs[DEVLINK_ATTR_TRAP_NAME]) return NULL; attr = info->attrs[DEVLINK_ATTR_TRAP_NAME]; return devlink_trap_item_lookup(devlink, nla_data(attr)); } static int devlink_trap_action_get_from_info(struct genl_info *info, enum devlink_trap_action *p_trap_action) { u8 val; val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: case DEVLINK_TRAP_ACTION_TRAP: case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: return -EINVAL; } return 0; } static int devlink_trap_metadata_put(struct sk_buff *msg, const struct devlink_trap *trap) { struct nlattr *attr; attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA); if (!attr) return -EMSGSIZE; if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT)) goto nla_put_failure; if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) && nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, struct devlink_stats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { struct devlink_stats *cpu_stats; u64 rx_packets, rx_bytes; unsigned int start; cpu_stats = per_cpu_ptr(trap_stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); rx_packets = u64_stats_read(&cpu_stats->rx_packets); rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); } } static int devlink_trap_group_stats_put(struct sk_buff *msg, struct devlink_stats __percpu *trap_stats) { struct devlink_stats stats; struct nlattr *attr; devlink_trap_stats_read(trap_stats, &stats); attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets))) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item) { struct devlink_stats stats; struct nlattr *attr; u64 drops = 0; int err; if (devlink->ops->trap_drop_counter_get) { err = devlink->ops->trap_drop_counter_get(devlink, trap_item->trap, &drops); if (err) return err; } devlink_trap_stats_read(trap_item->stats, &stats); attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink->ops->trap_drop_counter_get && devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets))) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { struct devlink_trap_group_item *group_item = trap_item->group_item; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, group_item->group->name)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type)) goto nla_put_failure; if (trap_item->trap->generic && nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action)) goto nla_put_failure; err = devlink_trap_metadata_put(msg, trap_item->trap); if (err) goto nla_put_failure; err = devlink_trap_stats_put(msg, devlink, trap_item); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_fill; return genlmsg_reply(msg, info); err_trap_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_item *trap_item; int idx = 0; int err = 0; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_fill(msg, devlink, trap_item, DEVLINK_CMD_TRAP_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); } static int __devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, enum devlink_trap_action trap_action, struct netlink_ext_ack *extack) { int err; if (trap_item->action != trap_action && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) { NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping"); return 0; } err = devlink->ops->trap_action_set(devlink, trap_item->trap, trap_action, extack); if (err) return err; trap_item->action = trap_action; return 0; } static int devlink_trap_action_set(struct devlink *devlink, struct devlink_trap_item *trap_item, struct genl_info *info) { enum devlink_trap_action trap_action; int err; if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) return 0; err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } return __devlink_trap_action_set(devlink, trap_item, trap_action, info->extack); } int devlink_nl_trap_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_item *trap_item; if (list_empty(&devlink->trap_list)) return -EOPNOTSUPP; trap_item = devlink_trap_item_get_from_info(devlink, info); if (!trap_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap"); return -ENOENT; } return devlink_trap_action_set(devlink, trap_item, info); } static struct devlink_trap_group_item * devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (!strcmp(group_item->group->name, name)) return group_item; } return NULL; } static struct devlink_trap_group_item * devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (group_item->group->id == id) return group_item; } return NULL; } static struct devlink_trap_group_item * devlink_trap_group_item_get_from_info(struct devlink *devlink, struct genl_info *info) { char *name; if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]) return NULL; name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]); return devlink_trap_group_item_lookup(devlink, name); } static int devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_group_item *group_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, group_item->group->name)) goto nla_put_failure; if (group_item->group->generic && nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; if (group_item->policer_item && nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, group_item->policer_item->policer->id)) goto nla_put_failure; err = devlink_trap_group_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_group_list)) return -EOPNOTSUPP; group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_group_fill; return genlmsg_reply(msg, info); err_trap_group_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_group_item *group_item; int idx = 0; int err = 0; list_for_each_entry(group_item, &devlink->trap_group_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_group_fill(msg, devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one); } static int __devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, enum devlink_trap_action trap_action, struct netlink_ext_ack *extack) { const char *group_name = group_item->group->name; struct devlink_trap_item *trap_item; int err; if (devlink->ops->trap_group_action_set) { err = devlink->ops->trap_group_action_set(devlink, group_item->group, trap_action, extack); if (err) return err; list_for_each_entry(trap_item, &devlink->trap_list, list) { if (strcmp(trap_item->group_item->group->name, group_name)) continue; if (trap_item->action != trap_action && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) continue; trap_item->action = trap_action; } return 0; } list_for_each_entry(trap_item, &devlink->trap_list, list) { if (strcmp(trap_item->group_item->group->name, group_name)) continue; err = __devlink_trap_action_set(devlink, trap_item, trap_action, extack); if (err) return err; } return 0; } static int devlink_trap_group_action_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, struct genl_info *info, bool *p_modified) { enum devlink_trap_action trap_action; int err; if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION]) return 0; err = devlink_trap_action_get_from_info(info, &trap_action); if (err) { NL_SET_ERR_MSG(info->extack, "Invalid trap action"); return -EINVAL; } err = __devlink_trap_group_action_set(devlink, group_item, trap_action, info->extack); if (err) return err; *p_modified = true; return 0; } static int devlink_trap_group_set(struct devlink *devlink, struct devlink_trap_group_item *group_item, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; const struct devlink_trap_policer *policer; struct nlattr **attrs = info->attrs; u32 policer_id; int err; if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) return 0; if (!devlink->ops->trap_group_set) return -EOPNOTSUPP; policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); if (policer_id && !policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } policer = policer_item ? policer_item->policer : NULL; err = devlink->ops->trap_group_set(devlink, group_item->group, policer, extack); if (err) return err; group_item->policer_item = policer_item; return 0; } int devlink_nl_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; bool modified = false; int err; if (list_empty(&devlink->trap_group_list)) return -EOPNOTSUPP; group_item = devlink_trap_group_item_get_from_info(devlink, info); if (!group_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap group"); return -ENOENT; } err = devlink_trap_group_action_set(devlink, group_item, info, &modified); if (err) return err; err = devlink_trap_group_set(devlink, group_item, info); if (err) goto err_trap_group_set; return 0; err_trap_group_set: if (modified) NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already"); return err; } static struct devlink_trap_policer_item * devlink_trap_policer_item_get_from_info(struct devlink *devlink, struct genl_info *info) { u32 id; if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) return NULL; id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); return devlink_trap_policer_item_lookup(devlink, id); } static int devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_policer *policer) { struct nlattr *attr; u64 drops; int err; if (!devlink->ops->trap_policer_counter_get) return 0; err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); if (err) return err; attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); if (!attr) return -EMSGSIZE; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, policer_item->policer->id)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, policer_item->rate)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, policer_item->burst)) goto nla_put_failure; err = devlink_trap_policer_stats_put(msg, devlink, policer_item->policer); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int devlink_nl_trap_policer_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct sk_buff *msg; int err; if (list_empty(&devlink->trap_policer_list)) return -EOPNOTSUPP; policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, info->snd_portid, info->snd_seq, 0); if (err) goto err_trap_policer_fill; return genlmsg_reply(msg, info); err_trap_policer_fill: nlmsg_free(msg); return err; } static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_trap_policer_item *policer_item; int idx = 0; int err = 0; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one); } static int devlink_trap_policer_set(struct devlink *devlink, struct devlink_trap_policer_item *policer_item, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; struct nlattr **attrs = info->attrs; u64 rate, burst; int err; rate = policer_item->rate; burst = policer_item->burst; if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); if (rate < policer_item->policer->min_rate) { NL_SET_ERR_MSG(extack, "Policer rate lower than limit"); return -EINVAL; } if (rate > policer_item->policer->max_rate) { NL_SET_ERR_MSG(extack, "Policer rate higher than limit"); return -EINVAL; } if (burst < policer_item->policer->min_burst) { NL_SET_ERR_MSG(extack, "Policer burst size lower than limit"); return -EINVAL; } if (burst > policer_item->policer->max_burst) { NL_SET_ERR_MSG(extack, "Policer burst size higher than limit"); return -EINVAL; } err = devlink->ops->trap_policer_set(devlink, policer_item->policer, rate, burst, info->extack); if (err) return err; policer_item->rate = rate; policer_item->burst = burst; return 0; } int devlink_nl_trap_policer_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_trap_policer_item *policer_item; struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; if (list_empty(&devlink->trap_policer_list)) return -EOPNOTSUPP; if (!devlink->ops->trap_policer_set) return -EOPNOTSUPP; policer_item = devlink_trap_policer_item_get_from_info(devlink, info); if (!policer_item) { NL_SET_ERR_MSG(extack, "Device did not register this trap policer"); return -ENOENT; } return devlink_trap_policer_set(devlink, policer_item, info); } #define DEVLINK_TRAP(_id, _type) \ { \ .type = DEVLINK_TRAP_TYPE_##_type, \ .id = DEVLINK_TRAP_GENERIC_ID_##_id, \ .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \ } static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(SMAC_MC, DROP), DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP), DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP), DEVLINK_TRAP(INGRESS_STP_FILTER, DROP), DEVLINK_TRAP(EMPTY_TX_LIST, DROP), DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP), DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP), DEVLINK_TRAP(TTL_ERROR, EXCEPTION), DEVLINK_TRAP(TAIL_DROP, DROP), DEVLINK_TRAP(NON_IP_PACKET, DROP), DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP), DEVLINK_TRAP(DIP_LB, DROP), DEVLINK_TRAP(SIP_MC, DROP), DEVLINK_TRAP(SIP_LB, DROP), DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP), DEVLINK_TRAP(IPV4_SIP_BC, DROP), DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP), DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP), DEVLINK_TRAP(MTU_ERROR, EXCEPTION), DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION), DEVLINK_TRAP(RPF, EXCEPTION), DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION), DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION), DEVLINK_TRAP(NON_ROUTABLE, DROP), DEVLINK_TRAP(DECAP_ERROR, EXCEPTION), DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(STP, CONTROL), DEVLINK_TRAP(LACP, CONTROL), DEVLINK_TRAP(LLDP, CONTROL), DEVLINK_TRAP(IGMP_QUERY, CONTROL), DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), DEVLINK_TRAP(MLD_QUERY, CONTROL), DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), DEVLINK_TRAP(MLD_V1_DONE, CONTROL), DEVLINK_TRAP(IPV4_DHCP, CONTROL), DEVLINK_TRAP(IPV6_DHCP, CONTROL), DEVLINK_TRAP(ARP_REQUEST, CONTROL), DEVLINK_TRAP(ARP_RESPONSE, CONTROL), DEVLINK_TRAP(ARP_OVERLAY, CONTROL), DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), DEVLINK_TRAP(IPV4_BFD, CONTROL), DEVLINK_TRAP(IPV6_BFD, CONTROL), DEVLINK_TRAP(IPV4_OSPF, CONTROL), DEVLINK_TRAP(IPV6_OSPF, CONTROL), DEVLINK_TRAP(IPV4_BGP, CONTROL), DEVLINK_TRAP(IPV6_BGP, CONTROL), DEVLINK_TRAP(IPV4_VRRP, CONTROL), DEVLINK_TRAP(IPV6_VRRP, CONTROL), DEVLINK_TRAP(IPV4_PIM, CONTROL), DEVLINK_TRAP(IPV6_PIM, CONTROL), DEVLINK_TRAP(UC_LB, CONTROL), DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(PTP_EVENT, CONTROL), DEVLINK_TRAP(PTP_GENERAL, CONTROL), DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), DEVLINK_TRAP(EARLY_DROP, DROP), DEVLINK_TRAP(VXLAN_PARSING, DROP), DEVLINK_TRAP(LLC_SNAP_PARSING, DROP), DEVLINK_TRAP(VLAN_PARSING, DROP), DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP), DEVLINK_TRAP(MPLS_PARSING, DROP), DEVLINK_TRAP(ARP_PARSING, DROP), DEVLINK_TRAP(IP_1_PARSING, DROP), DEVLINK_TRAP(IP_N_PARSING, DROP), DEVLINK_TRAP(GRE_PARSING, DROP), DEVLINK_TRAP(UDP_PARSING, DROP), DEVLINK_TRAP(TCP_PARSING, DROP), DEVLINK_TRAP(IPSEC_PARSING, DROP), DEVLINK_TRAP(SCTP_PARSING, DROP), DEVLINK_TRAP(DCCP_PARSING, DROP), DEVLINK_TRAP(GTP_PARSING, DROP), DEVLINK_TRAP(ESP_PARSING, DROP), DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), DEVLINK_TRAP(DMAC_FILTER, DROP), DEVLINK_TRAP(EAPOL, CONTROL), DEVLINK_TRAP(LOCKED_PORT, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ { \ .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \ .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \ } static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), DEVLINK_TRAP_GROUP(STP), DEVLINK_TRAP_GROUP(LACP), DEVLINK_TRAP_GROUP(LLDP), DEVLINK_TRAP_GROUP(MC_SNOOPING), DEVLINK_TRAP_GROUP(DHCP), DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), DEVLINK_TRAP_GROUP(BFD), DEVLINK_TRAP_GROUP(OSPF), DEVLINK_TRAP_GROUP(BGP), DEVLINK_TRAP_GROUP(VRRP), DEVLINK_TRAP_GROUP(PIM), DEVLINK_TRAP_GROUP(UC_LB), DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY), DEVLINK_TRAP_GROUP(IPV6), DEVLINK_TRAP_GROUP(PTP_EVENT), DEVLINK_TRAP_GROUP(PTP_GENERAL), DEVLINK_TRAP_GROUP(ACL_SAMPLE), DEVLINK_TRAP_GROUP(ACL_TRAP), DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), DEVLINK_TRAP_GROUP(EAPOL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) { if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX) return -EINVAL; if (strcmp(trap->name, devlink_trap_generic[trap->id].name)) return -EINVAL; if (trap->type != devlink_trap_generic[trap->id].type) return -EINVAL; return 0; } static int devlink_trap_driver_verify(const struct devlink_trap *trap) { int i; if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX) return -EINVAL; for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) { if (!strcmp(trap->name, devlink_trap_generic[i].name)) return -EEXIST; } return 0; } static int devlink_trap_verify(const struct devlink_trap *trap) { if (!trap || !trap->name) return -EINVAL; if (trap->generic) return devlink_trap_generic_verify(trap); else return devlink_trap_driver_verify(trap); } static int devlink_trap_group_generic_verify(const struct devlink_trap_group *group) { if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) return -EINVAL; if (strcmp(group->name, devlink_trap_group_generic[group->id].name)) return -EINVAL; return 0; } static int devlink_trap_group_driver_verify(const struct devlink_trap_group *group) { int i; if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX) return -EINVAL; for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) { if (!strcmp(group->name, devlink_trap_group_generic[i].name)) return -EEXIST; } return 0; } static int devlink_trap_group_verify(const struct devlink_trap_group *group) { if (group->generic) return devlink_trap_group_generic_verify(group); else return devlink_trap_group_driver_verify(group); } static void devlink_trap_group_notify(struct devlink *devlink, const struct devlink_trap_group_item *group_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && cmd != DEVLINK_CMD_TRAP_GROUP_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_trap_groups_notify_register(struct devlink *devlink) { struct devlink_trap_group_item *group_item; list_for_each_entry(group_item, &devlink->trap_group_list, list) devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW); } void devlink_trap_groups_notify_unregister(struct devlink *devlink) { struct devlink_trap_group_item *group_item; list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_DEL); } static int devlink_trap_item_group_link(struct devlink *devlink, struct devlink_trap_item *trap_item) { u16 group_id = trap_item->trap->init_group_id; struct devlink_trap_group_item *group_item; group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); if (WARN_ON_ONCE(!group_item)) return -EINVAL; trap_item->group_item = group_item; return 0; } static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && cmd != DEVLINK_CMD_TRAP_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_traps_notify_register(struct devlink *devlink) { struct devlink_trap_item *trap_item; list_for_each_entry(trap_item, &devlink->trap_list, list) devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); } void devlink_traps_notify_unregister(struct devlink *devlink) { struct devlink_trap_item *trap_item; list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); } static int devlink_trap_register(struct devlink *devlink, const struct devlink_trap *trap, void *priv) { struct devlink_trap_item *trap_item; int err; if (devlink_trap_item_lookup(devlink, trap->name)) return -EEXIST; trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL); if (!trap_item) return -ENOMEM; trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); if (!trap_item->stats) { err = -ENOMEM; goto err_stats_alloc; } trap_item->trap = trap; trap_item->action = trap->init_action; trap_item->priv = priv; err = devlink_trap_item_group_link(devlink, trap_item); if (err) goto err_group_link; err = devlink->ops->trap_init(devlink, trap, trap_item); if (err) goto err_trap_init; list_add_tail(&trap_item->list, &devlink->trap_list); devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); return 0; err_trap_init: err_group_link: free_percpu(trap_item->stats); err_stats_alloc: kfree(trap_item); return err; } static void devlink_trap_unregister(struct devlink *devlink, const struct devlink_trap *trap) { struct devlink_trap_item *trap_item; trap_item = devlink_trap_item_lookup(devlink, trap->name); if (WARN_ON_ONCE(!trap_item)) return; devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); list_del(&trap_item->list); if (devlink->ops->trap_fini) devlink->ops->trap_fini(devlink, trap, trap_item); free_percpu(trap_item->stats); kfree(trap_item); } static void devlink_trap_disable(struct devlink *devlink, const struct devlink_trap *trap) { struct devlink_trap_item *trap_item; trap_item = devlink_trap_item_lookup(devlink, trap->name); if (WARN_ON_ONCE(!trap_item)) return; devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP, NULL); trap_item->action = DEVLINK_TRAP_ACTION_DROP; } /** * devl_traps_register - Register packet traps with devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * @priv: Driver private information. * * Return: Non-zero value on failure. */ int devl_traps_register(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count, void *priv) { int i, err; if (!devlink->ops->trap_init || !devlink->ops->trap_action_set) return -EINVAL; devl_assert_locked(devlink); for (i = 0; i < traps_count; i++) { const struct devlink_trap *trap = &traps[i]; err = devlink_trap_verify(trap); if (err) goto err_trap_verify; err = devlink_trap_register(devlink, trap, priv); if (err) goto err_trap_register; } return 0; err_trap_register: err_trap_verify: for (i--; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); return err; } EXPORT_SYMBOL_GPL(devl_traps_register); /** * devlink_traps_register - Register packet traps with devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * @priv: Driver private information. * * Context: Takes and release devlink->lock <mutex>. * * Return: Non-zero value on failure. */ int devlink_traps_register(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count, void *priv) { int err; devl_lock(devlink); err = devl_traps_register(devlink, traps, traps_count, priv); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_traps_register); /** * devl_traps_unregister - Unregister packet traps from devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. */ void devl_traps_unregister(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count) { int i; devl_assert_locked(devlink); /* Make sure we do not have any packets in-flight while unregistering * traps by disabling all of them and waiting for a grace period. */ for (i = traps_count - 1; i >= 0; i--) devlink_trap_disable(devlink, &traps[i]); synchronize_rcu(); for (i = traps_count - 1; i >= 0; i--) devlink_trap_unregister(devlink, &traps[i]); } EXPORT_SYMBOL_GPL(devl_traps_unregister); /** * devlink_traps_unregister - Unregister packet traps from devlink. * @devlink: devlink. * @traps: Packet traps. * @traps_count: Count of provided packet traps. * * Context: Takes and release devlink->lock <mutex>. */ void devlink_traps_unregister(struct devlink *devlink, const struct devlink_trap *traps, size_t traps_count) { devl_lock(devlink); devl_traps_unregister(devlink, traps, traps_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_traps_unregister); static void devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats, size_t skb_len) { struct devlink_stats *stats; stats = this_cpu_ptr(trap_stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->rx_bytes, skb_len); u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } static void devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, const struct devlink_trap_item *trap_item, struct devlink_port *in_devlink_port, const struct flow_action_cookie *fa_cookie) { metadata->trap_name = trap_item->trap->name; metadata->trap_group_name = trap_item->group_item->group->name; metadata->fa_cookie = fa_cookie; metadata->trap_type = trap_item->trap->type; spin_lock(&in_devlink_port->type_lock); if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) metadata->input_dev = in_devlink_port->type_eth.netdev; spin_unlock(&in_devlink_port->type_lock); } /** * devlink_trap_report - Report trapped packet to drop monitor. * @devlink: devlink. * @skb: Trapped packet. * @trap_ctx: Trap context. * @in_devlink_port: Input devlink port. * @fa_cookie: Flow action cookie. Could be NULL. */ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, void *trap_ctx, struct devlink_port *in_devlink_port, const struct flow_action_cookie *fa_cookie) { struct devlink_trap_item *trap_item = trap_ctx; devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); if (tracepoint_enabled(devlink_trap_report)) { struct devlink_trap_metadata metadata = {}; devlink_trap_report_metadata_set(&metadata, trap_item, in_devlink_port, fa_cookie); trace_devlink_trap_report(devlink, skb, &metadata); } } EXPORT_SYMBOL_GPL(devlink_trap_report); /** * devlink_trap_ctx_priv - Trap context to driver private information. * @trap_ctx: Trap context. * * Return: Driver private information passed during registration. */ void *devlink_trap_ctx_priv(void *trap_ctx) { struct devlink_trap_item *trap_item = trap_ctx; return trap_item->priv; } EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); static int devlink_trap_group_item_policer_link(struct devlink *devlink, struct devlink_trap_group_item *group_item) { u32 policer_id = group_item->group->init_policer_id; struct devlink_trap_policer_item *policer_item; if (policer_id == 0) return 0; policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); if (WARN_ON_ONCE(!policer_item)) return -EINVAL; group_item->policer_item = policer_item; return 0; } static int devlink_trap_group_register(struct devlink *devlink, const struct devlink_trap_group *group) { struct devlink_trap_group_item *group_item; int err; if (devlink_trap_group_item_lookup(devlink, group->name)) return -EEXIST; group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); if (!group_item) return -ENOMEM; group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); if (!group_item->stats) { err = -ENOMEM; goto err_stats_alloc; } group_item->group = group; err = devlink_trap_group_item_policer_link(devlink, group_item); if (err) goto err_policer_link; if (devlink->ops->trap_group_init) { err = devlink->ops->trap_group_init(devlink, group); if (err) goto err_group_init; } list_add_tail(&group_item->list, &devlink->trap_group_list); devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_NEW); return 0; err_group_init: err_policer_link: free_percpu(group_item->stats); err_stats_alloc: kfree(group_item); return err; } static void devlink_trap_group_unregister(struct devlink *devlink, const struct devlink_trap_group *group) { struct devlink_trap_group_item *group_item; group_item = devlink_trap_group_item_lookup(devlink, group->name); if (WARN_ON_ONCE(!group_item)) return; devlink_trap_group_notify(devlink, group_item, DEVLINK_CMD_TRAP_GROUP_DEL); list_del(&group_item->list); free_percpu(group_item->stats); kfree(group_item); } /** * devl_trap_groups_register - Register packet trap groups with devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Return: Non-zero value on failure. */ int devl_trap_groups_register(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int i, err; devl_assert_locked(devlink); for (i = 0; i < groups_count; i++) { const struct devlink_trap_group *group = &groups[i]; err = devlink_trap_group_verify(group); if (err) goto err_trap_group_verify; err = devlink_trap_group_register(devlink, group); if (err) goto err_trap_group_register; } return 0; err_trap_group_register: err_trap_group_verify: for (i--; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); return err; } EXPORT_SYMBOL_GPL(devl_trap_groups_register); /** * devlink_trap_groups_register - Register packet trap groups with devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Context: Takes and release devlink->lock <mutex>. * * Return: Non-zero value on failure. */ int devlink_trap_groups_register(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int err; devl_lock(devlink); err = devl_trap_groups_register(devlink, groups, groups_count); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_trap_groups_register); /** * devl_trap_groups_unregister - Unregister packet trap groups from devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. */ void devl_trap_groups_unregister(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { int i; devl_assert_locked(devlink); for (i = groups_count - 1; i >= 0; i--) devlink_trap_group_unregister(devlink, &groups[i]); } EXPORT_SYMBOL_GPL(devl_trap_groups_unregister); /** * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. * @devlink: devlink. * @groups: Packet trap groups. * @groups_count: Count of provided packet trap groups. * * Context: Takes and release devlink->lock <mutex>. */ void devlink_trap_groups_unregister(struct devlink *devlink, const struct devlink_trap_group *groups, size_t groups_count) { devl_lock(devlink); devl_trap_groups_unregister(devlink, groups, groups_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); static void devlink_trap_policer_notify(struct devlink *devlink, const struct devlink_trap_policer_item *policer_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && cmd != DEVLINK_CMD_TRAP_POLICER_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_trap_policers_notify_register(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; list_for_each_entry(policer_item, &devlink->trap_policer_list, list) devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW); } void devlink_trap_policers_notify_unregister(struct devlink *devlink) { struct devlink_trap_policer_item *policer_item; list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, list) devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_DEL); } static int devlink_trap_policer_register(struct devlink *devlink, const struct devlink_trap_policer *policer) { struct devlink_trap_policer_item *policer_item; int err; if (devlink_trap_policer_item_lookup(devlink, policer->id)) return -EEXIST; policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); if (!policer_item) return -ENOMEM; policer_item->policer = policer; policer_item->rate = policer->init_rate; policer_item->burst = policer->init_burst; if (devlink->ops->trap_policer_init) { err = devlink->ops->trap_policer_init(devlink, policer); if (err) goto err_policer_init; } list_add_tail(&policer_item->list, &devlink->trap_policer_list); devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_NEW); return 0; err_policer_init: kfree(policer_item); return err; } static void devlink_trap_policer_unregister(struct devlink *devlink, const struct devlink_trap_policer *policer) { struct devlink_trap_policer_item *policer_item; policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); if (WARN_ON_ONCE(!policer_item)) return; devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_DEL); list_del(&policer_item->list); if (devlink->ops->trap_policer_fini) devlink->ops->trap_policer_fini(devlink, policer); kfree(policer_item); } /** * devl_trap_policers_register - Register packet trap policers with devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. * * Return: Non-zero value on failure. */ int devl_trap_policers_register(struct devlink *devlink, const struct devlink_trap_policer *policers, size_t policers_count) { int i, err; devl_assert_locked(devlink); for (i = 0; i < policers_count; i++) { const struct devlink_trap_policer *policer = &policers[i]; if (WARN_ON(policer->id == 0 || policer->max_rate < policer->min_rate || policer->max_burst < policer->min_burst)) { err = -EINVAL; goto err_trap_policer_verify; } err = devlink_trap_policer_register(devlink, policer); if (err) goto err_trap_policer_register; } return 0; err_trap_policer_register: err_trap_policer_verify: for (i--; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); return err; } EXPORT_SYMBOL_GPL(devl_trap_policers_register); /** * devl_trap_policers_unregister - Unregister packet trap policers from devlink. * @devlink: devlink. * @policers: Packet trap policers. * @policers_count: Count of provided packet trap policers. */ void devl_trap_policers_unregister(struct devlink *devlink, const struct devlink_trap_policer *policers, size_t policers_count) { int i; devl_assert_locked(devlink); for (i = policers_count - 1; i >= 0; i--) devlink_trap_policer_unregister(devlink, &policers[i]); } EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0-or-later /* * SR-IPv6 implementation -- HMAC functions * * Author: * David Lebrun <david.lebrun@uclouvain.be> */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <crypto/sha1.h> #include <crypto/sha2.h> #include <crypto/utils.h> #include <net/seg6.h> #include <net/genetlink.h> #include <net/seg6_hmac.h> #include <linux/random.h> struct hmac_storage { local_lock_t bh_lock; char hmac_ring[SEG6_HMAC_RING_SIZE]; }; static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct seg6_hmac_info *hinfo = obj; return (hinfo->hmackeyid != *(__u32 *)arg->key); } static inline void seg6_hinfo_release(struct seg6_hmac_info *hinfo) { kfree_rcu(hinfo, rcu); } static void seg6_free_hi(void *ptr, void *arg) { struct seg6_hmac_info *hinfo = (struct seg6_hmac_info *)ptr; if (hinfo) seg6_hinfo_release(hinfo); } static const struct rhashtable_params rht_params = { .head_offset = offsetof(struct seg6_hmac_info, node), .key_offset = offsetof(struct seg6_hmac_info, hmackeyid), .key_len = sizeof(u32), .automatic_shrinking = true, .obj_cmpfn = seg6_hmac_cmpfn, }; static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; if (srh->hdrlen < (srh->first_segment + 1) * 2 + 5) return NULL; if (!sr_has_hmac(srh)) return NULL; tlv = (struct sr6_tlv_hmac *) ((char *)srh + ((srh->hdrlen + 1) << 3) - 40); if (tlv->tlvhdr.type != SR6_TLV_HMAC || tlv->tlvhdr.len != 38) return NULL; return tlv; } int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); int plen, i, ret = 0; char *ring, *off; /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; /* this limit allows for 14 segments */ if (plen >= SEG6_HMAC_RING_SIZE) return -EMSGSIZE; /* Let's build the HMAC text on the ring buffer. The text is composed * as follows, in order: * * 1. Source IPv6 address (128 bits) * 2. first_segment value (8 bits) * 3. Flags (8 bits) * 4. HMAC Key ID (32 bits) * 5. All segments in the segments list (n * 128 bits) */ local_bh_disable(); local_lock_nested_bh(&hmac_storage.bh_lock); ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ memcpy(off, saddr, 16); off += 16; /* first_segment value */ *off++ = hdr->first_segment; /* flags */ *off++ = hdr->flags; /* HMAC Key ID */ memcpy(off, &hmackeyid, 4); off += 4; /* all segments in the list */ for (i = 0; i < hdr->first_segment + 1; i++) { memcpy(off, hdr->segments + i, 16); off += 16; } switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: hmac_sha1(&hinfo->key.sha1, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); memset(&output[SHA1_DIGEST_SIZE], 0, SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); break; case SEG6_HMAC_ALGO_SHA256: hmac_sha256(&hinfo->key.sha256, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); break; default: WARN_ON_ONCE(1); ret = -EINVAL; break; } local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); return ret; } EXPORT_SYMBOL(seg6_hmac_compute); /* checks if an incoming SR-enabled packet's HMAC status matches * the incoming policy. * * called with rcu_read_lock() */ bool seg6_hmac_validate_skb(struct sk_buff *skb) { u8 hmac_output[SEG6_HMAC_FIELD_LEN]; struct net *net = dev_net(skb->dev); struct seg6_hmac_info *hinfo; struct sr6_tlv_hmac *tlv; struct ipv6_sr_hdr *srh; struct inet6_dev *idev; int require_hmac; idev = __in6_dev_get(skb->dev); srh = (struct ipv6_sr_hdr *)skb_transport_header(skb); tlv = seg6_get_tlv_hmac(srh); require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac); /* mandatory check but no tlv */ if (require_hmac > 0 && !tlv) return false; /* no check */ if (require_hmac < 0) return true; /* check only if present */ if (require_hmac == 0 && !tlv) return true; /* now, seg6_require_hmac >= 0 && tlv */ hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); if (!hinfo) return false; if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN)) return false; return true; } EXPORT_SYMBOL(seg6_hmac_validate_skb); /* called with rcu_read_lock() */ struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct seg6_hmac_info *hinfo; hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); return hinfo; } EXPORT_SYMBOL(seg6_hmac_info_lookup); int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) { struct seg6_pernet_data *sdata = seg6_pernet(net); int err; switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: hmac_sha1_preparekey(&hinfo->key.sha1, hinfo->secret, hinfo->slen); break; case SEG6_HMAC_ALGO_SHA256: hmac_sha256_preparekey(&hinfo->key.sha256, hinfo->secret, hinfo->slen); break; default: return -EINVAL; } err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); return err; } EXPORT_SYMBOL(seg6_hmac_info_add); int seg6_hmac_info_del(struct net *net, u32 key) { struct seg6_pernet_data *sdata = seg6_pernet(net); struct seg6_hmac_info *hinfo; int err = -ENOENT; hinfo = rhashtable_lookup_fast(&sdata->hmac_infos, &key, rht_params); if (!hinfo) goto out; err = rhashtable_remove_fast(&sdata->hmac_infos, &hinfo->node, rht_params); if (err) goto out; seg6_hinfo_release(hinfo); out: return err; } EXPORT_SYMBOL(seg6_hmac_info_del); int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh) { struct seg6_hmac_info *hinfo; struct sr6_tlv_hmac *tlv; int err = -ENOENT; tlv = seg6_get_tlv_hmac(srh); if (!tlv) return -EINVAL; rcu_read_lock(); hinfo = seg6_hmac_info_lookup(net, be32_to_cpu(tlv->hmackeyid)); if (!hinfo) goto out; memset(tlv->hmac, 0, SEG6_HMAC_FIELD_LEN); err = seg6_hmac_compute(hinfo, srh, saddr, tlv->hmac); out: rcu_read_unlock(); return err; } EXPORT_SYMBOL(seg6_push_hmac); int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); return rhashtable_init(&sdata->hmac_infos, &rht_params); } void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); rhashtable_free_and_destroy(&sdata->hmac_infos, seg6_free_hi, NULL); } EXPORT_SYMBOL(seg6_hmac_net_exit);
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 137 137 137 162 1 1 1 1 1 1 1 1 4 2 2 2 2 4 56 56 56 57 56 57 57 1 2 1 2 2 3 3 3 3 3 8 9 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 1 1 1 1 1 1 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 2 2 2 2 3 2 2 1 27 27 6 6 6 5 3 3 3 3 3 3 3 6 6 6 6 6 2 2 2 133 134 134 20 20 20 20 84 3 3 3 3 3 3 77 75 8 8 3 3 78 3 68 40 40 68 69 70 37 71 130 128 128 6 6 6 6 6 6 6 34 140 120 133 69 1 1 1 70 70 70 70 69 39 30 31 93 134 134 133 238 239 212 240 11 11 10 11 11 11 30 30 29 29 28 11 17 30 30 30 26 27 27 27 26 27 29 8 8 8 40 40 104 175 173 175 175 172 152 123 121 122 1 122 1 120 9 117 120 18 105 1 1 104 102 174 50 22 6 48 168 168 102 104 104 81 104 102 18 101 39 37 40 104 18 1 1 8 205 31 31 31 28 1 1 1 30 173 203 206 206 206 30 2 142 141 142 138 35 141 137 110 109 109 104 32 31 105 64 34 108 77 48 50 47 48 48 134 134 130 42 23 21 42 32 32 134 161 161 160 161 164 91 89 86 33 33 61 61 136 19 17 17 14 5 4 13 4 4 1 129 28 26 28 107 32 8 8 2 8 26 78 67 51 51 67 66 21 143 163 125 126 96 3 3 3 3 3 1 3 3 3 1 3 3 3 3 3 3 2 3 3 3 3 3 3 3 1 3 3 3 3 3 3 2 2 4 4 4 4 4 4 4 2 4 4 6 6 4 1 1 11 1 12 11 8 3 3 3 3 1 2 2 2 1 1 1 1 1 12 12 8 7 7 7 7 7 1 6 7 3 6 6 1 6 6 6 6 1 6 6 6 1 5 2 4 4 2 4 1 4 4 1 1 1 3 3 3 4 77 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox : Verify area fixes. * Alan Cox : cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox : Use __u32 properly * Alan Cox : Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Vitaly E. Lavrov : Race condition in ip_route_input_slow. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include "fib_lookup.h" #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ) #define DEFAULT_MIN_ADVMSS 256 static int ip_rt_max_size; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); return NULL; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #else #define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) #endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos) return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } (*pos)++; return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, 0, /* st->gc_total */ 0, /* st->gc_ignored */ 0, /* st->gc_goal_miss */ 0, /* st->gc_dst_overflow */ 0, /* st->in_hlist_search */ 0 /* st->out_hlist_search */ ); return 0; } static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, .show = rt_cpu_seq_show, }; #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; unsigned int i, j; dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); if (!dst) return -ENOMEM; for_each_possible_cpu(i) { src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); for (j = 0; j < 256; j++) { dst[j].o_bytes += src[j].o_bytes; dst[j].o_packets += src[j].o_packets; dst[j].i_bytes += src[j].i_bytes; dst[j].i_packets += src[j].i_packets; } } seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); kfree(dst); return 0; } #endif static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; pde = proc_create_seq("rt_cache", 0444, net->proc_net, &rt_cache_seq_ops); if (!pde) goto err1; pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_ops); if (!pde) goto err2; #ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create_single("rt_acct", 0, net->proc_net, rt_acct_proc_show); if (!pde) goto err3; #endif return 0; #ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif err2: remove_proc_entry("rt_cache", net->proc_net); err1: return -ENOMEM; } static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); #ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, }; static int __init ip_rt_proc_init(void) { return register_pernet_subsys(&ip_rt_proc_ops); } #else static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ static inline bool rt_is_expired(const struct rtable *rth) { bool res; rcu_read_lock(); res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); rcu_read_unlock(); return res; } void rt_cache_flush(struct net *net) { rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev; struct neighbour *n; rcu_read_lock(); dev = dst_dev_rcu(dst); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { n = ip_neigh_gw6(dev, &rt->rt_gw6); } else { __be32 pkey; pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr); n = ip_neigh_gw4(dev, pkey); } if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst_dev(dst); const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { pkey = (const __be32 *)&rt->rt_gw4; } else if (rt->rt_gw_family == AF_INET6) { return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); } else if (!daddr || (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } /* Hash tables of size 2048..262144 depending on RAM size. * Each bucket uses 8 bytes. */ static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker * to infer how many packets were sent between two points in time. */ static u32 ip_idents_reserve(u32 hash, int segs) { u32 bucket, old, now = (u32)jiffies; atomic_t *p_id; u32 *p_tstamp; u32 delta = 0; bucket = hash & ip_idents_mask; p_tstamp = ip_tstamps + bucket; p_id = ip_idents + bucket; old = READ_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = get_random_u32_below(now - old); /* If UBSAN reports an error there, please make sure your compiler * supports -fno-strict-overflow before reporting it that was a bug * in UBSAN, and it has been fixed in GCC-8. */ return atomic_add_return(segs + delta, p_id) - segs; } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { u32 hash, id; /* Note the following code is not safe, but this is okay. */ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) get_random_bytes(&net->ipv4.ip_id_key, sizeof(net->ipv4.ip_id_key)); hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, int flow_flags) { __u8 scope = RT_SCOPE_UNIVERSE; if (sk) { oif = sk->sk_bound_dev_if; mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk_uid(sk)); rcu_read_unlock(); } static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) build_skb_flow_key(fl4, skb, sk); else build_sk_flow_key(fl4, sk); } static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { struct rtable *rt; rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } } static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception __rcu **fnhe_p, **oldest_p; struct fib_nh_exception *fnhe, *oldest = NULL; for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); if (!fnhe) break; if (!oldest || time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; oldest_p = fnhe_p; } } /* Clear oldest->fnhe_daddr to prevent this fnhe from being * rebound with new dsts in rt_bind_exception(). */ oldest->fnhe_daddr = 0; fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); } static u32 fnhe_hashfun(__be32 daddr) { static siphash_aligned_key_t fnhe_hash_key; u64 hval; net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } } static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, __be32 gw, u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; u32 genid, hval; unsigned int i; int depth; genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; depth = 0; for (fnhe = rcu_dereference(hash->chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) break; depth++; } if (fnhe) { if (fnhe->fnhe_genid != genid) fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; if (pmtu) { fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) fill_route_from_fnhe(rt, fnhe); rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) fill_route_from_fnhe(rt, fnhe); } else { /* Randomize max depth to avoid some side channels attacks. */ int max_depth = FNHE_RECLAIM_DEPTH + get_random_u32_below(FNHE_RECLAIM_DEPTH); while (depth > max_depth) { fnhe_remove_oldest(hash); depth--; } fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); if (!fnhe) goto out_unlock; fnhe->fnhe_next = hash->chain; fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); rcu_assign_pointer(hash->chain, fnhe); /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); for_each_possible_cpu(i) { struct rtable __rcu **prt; prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); } } fnhe->fnhe_stamp = jiffies; out_unlock: spin_unlock_bh(&fnhe_lock); } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct neighbour *n; struct net *net; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: break; default: return; } if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(READ_ONCE(n->nud_state) & NUD_VALID)) { neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, skb); nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } if (kill_route) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) { const struct iphdr *iph = (const struct iphdr *) skb->data; __be32 daddr = iph->daddr; __be32 saddr = iph->saddr; net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); } #endif ; } static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); if ((READ_ONCE(dst->obsolete) > 0) || (rt->rt_flags & RTCF_REDIRECTED) || READ_ONCE(rt->dst.expires)) sk_dst_reset(sk); } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; struct net *net; int log_martians; int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) { peer->rate_tokens = 0; peer->n_redirects = 0; } /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; goto out_unlock; } /* Check for load limit; set rate_last to the latest sent * redirect. */ if (peer->n_redirects == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } out_unlock: rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = skb->dev; struct in_device *in_dev; struct inet_peer *peer; unsigned long now; struct net *net; SKB_DR(reason); bool send; int code; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) goto out; } in_dev = __in_dev_get_rcu(dev); /* IP on this device is disabled. */ if (!in_dev) goto out; net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: SKB_DR_SET(reason, IP_INADDRERRORS); __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; } switch (rt->dst.error) { case EINVAL: default: goto out; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; break; case ENETUNREACH: code = ICMP_NET_UNREACH; SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; peer->rate_tokens += now - peer->rate_last; if (peer->rate_tokens > ip_rt_error_burst) peer->rate_tokens = ip_rt_error_burst; peer->rate_last = now; if (peer->rate_tokens >= ip_rt_error_cost) peer->rate_tokens -= ip_rt_error_cost; else send = false; } rcu_read_unlock(); if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb_reason(skb, reason); return 0; } static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; bool lock = false; struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) return; old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; rcu_read_lock(); net = dst_dev_net_rcu(dst); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); } if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, READ_ONCE(dst->expires) - net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res.fi) > 1) { int nhsel; for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { nhc = fib_info_nhc(res.fi, nhsel); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } out: rcu_read_unlock(); } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); /* Don't make lookup fail for bridged encapsulations */ if (skb && netif_is_any_bridge_port(skb->dev)) fl4.flowi4_oif = 0; __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; struct net *net = sock_net(sk); bh_lock_sock(sk); if (!ip_sk_accept_pmtu(sk)) goto out; odst = sk_dst_get(sk); if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = dst_rtable(odst); if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } if (new) sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. */ if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { struct inet_skb_parm parm; struct net_device *dev; int res; /* Recompile ip options since IPCB may not be valid anymore. * Also check we have a reasonable ipv4 header. */ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; memset(&parm, 0, sizeof(parm)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; } __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); } static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; } /* * We do not cache source address of outgoing interface, * because it is used only by IP RR, TS and SRR options, * so that it out of fast path. * * BTW remember: "addr" is allowed to be not aligned * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; if (rt_is_output_route(rt)) src = ip_hdr(skb)->saddr; else { struct fib_result res; struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); } #ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) rt->dst.tclassid |= tag & 0xFFFF; if (!(rt->dst.tclassid & 0xFFFF0000)) rt->dst.tclassid |= tag & 0xFFFF0000; } #endif static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); unsigned int advmss; struct net *net; rcu_read_lock(); net = dst_dev_net_rcu(dst); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; u32 hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; fnhe_p = &hash->chain; fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); while (fnhe) { if (fnhe->fnhe_daddr == daddr) { rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); /* set fnhe_daddr to 0 to ensure it won't bind with * new dsts in rt_bind_exception(). */ fnhe->fnhe_daddr = 0; fnhe_flush_routes(fnhe); kfree_rcu(fnhe, rcu); break; } fnhe_p = &fnhe->fnhe_next; fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)); } spin_unlock_bh(&fnhe_lock); } static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; if (!hash) return NULL; hval = fnhe_hashfun(daddr); for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { ip_del_fnhe(nhc, daddr); break; } return fnhe; } } return NULL; } /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. mtu from egress device */ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { struct fib_nh_common *nhc = res->nhc; struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; u32 mtu = 0; if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; if (likely(!mtu)) { struct fib_nh_exception *fnhe; fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { bool ret = false; spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { struct rtable __rcu **porig; struct rtable *orig; int genid = fnhe_genid(dev_net(rt->dst.dev)); if (rt_is_input_route(rt)) porig = &fnhe->fnhe_rth_input; else porig = &fnhe->fnhe_rth_output; orig = rcu_dereference(*porig); if (fnhe->fnhe_genid != genid) { fnhe->fnhe_genid = genid; fnhe->fnhe_gw = 0; fnhe->fnhe_pmtu = 0; fnhe->fnhe_expires = 0; fnhe->fnhe_mtu_locked = false; fnhe_flush_routes(fnhe); orig = NULL; } fill_route_from_fnhe(rt, fnhe); if (!rt->rt_gw4) { rt->rt_gw4 = daddr; rt->rt_gw_family = AF_INET; } if (do_cache) { dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); } ret = true; } fnhe->fnhe_stamp = jiffies; } spin_unlock_bh(&fnhe_lock); return ret; } static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nhc->nhc_rth_input; } else { p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; /* hold dst before doing cmpxchg() to avoid race condition * on this dst */ dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { dst_release(&rt->dst); ret = false; } return ret; } struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); void rt_add_uncached_list(struct rtable *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt_del_uncached_list(struct rtable *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void ipv4_dst_destroy(struct dst_entry *dst) { ip_dst_metrics_put(dst); rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) { struct rtable *rt, *safe; int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static bool rt_cache_valid(const struct rtable *rt) { return rt && READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag, const bool do_cache) { bool cached = false; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) rt->rt_gw4 = nhc->nhc_gw.ipv4; else rt->rt_gw6 = nhc->nhc_gw.ipv6; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); rt->dst.tclassid = nh->nh_tclassid; } #endif rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ if (!rt->rt_gw4) { rt->rt_gw_family = AF_INET; rt->rt_gw4 = daddr; } rt_add_uncached_list(rt); } } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif } struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); rt->rt_flags = flags; rt->rt_type = type; rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; rt->dst.output = ip_output; if (flags & RTCF_LOCAL) rt->dst.input = ip_local_deliver; } return rt; } EXPORT_SYMBOL(rt_dst_alloc); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) { struct rtable *new_rt; new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, rt->dst.flags); if (new_rt) { new_rt->rt_genid = rt_genid_ipv4(dev_net(dev)); new_rt->rt_flags = rt->rt_flags; new_rt->rt_type = rt->rt_type; new_rt->rt_is_input = rt->rt_is_input; new_rt->rt_iif = rt->rt_iif; new_rt->rt_pmtu = rt->rt_pmtu; new_rt->rt_mtu_locked = rt->rt_mtu_locked; new_rt->rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) new_rt->rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; new_rt->dst.input = READ_ONCE(rt->dst.input); new_rt->dst.output = READ_ONCE(rt->dst.output); new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); } return new_rt; } EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) return SKB_DROP_REASON_NOT_SPECIFIED; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) return SKB_DROP_REASON_IP_INVALID_SOURCE; if (skb->protocol != htons(ETH_P_IP)) return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) return reason; } return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ static enum skb_drop_reason ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (reason) return reason; if (our) flags |= RTCF_LOCAL; if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; rth->rt_is_input= 1; #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return SKB_NOT_DROPPED_YET; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ pr_warn("martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), dev->hard_header_len, false); } } #endif } /* called in rcu_read_lock() section */ static enum skb_drop_reason __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; bool do_cache; u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); goto cleanup; } do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { __be32 gw; gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. * * Proxy arp feature have been extended to allow, ARP * replies back to the same interface, to support * Private VLAN switch technologies. See arp.c. */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; } } rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { reason = SKB_DROP_REASON_NOMEM; goto cleanup; } rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: reason = SKB_NOT_DROPPED_YET; cleanup: return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH /* To make ICMP packets follow the right flow, the multipath hash is * calculated from the inner IP addresses. */ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; if (likely(outer_iph->protocol != IPPROTO_ICMP)) goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; out: hash_keys->addrs.v4addrs.src = key_iph->saddr; hash_keys->addrs.v4addrs.dst = key_iph->daddr; } static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. */ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = fl4->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash = 0; switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (skb) { ip_multipath_l3_keys(skb, &hash_keys); } else { hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); /* skb is currently provided only when forwarding */ if (skb) { struct flow_keys keys; skb_flow_dissect_flow_keys(skb, &keys, 0); /* Inner can be v4 or v6 */ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; hash_keys.tags.flow_label = keys.tags.flow_label; hash_keys.basic.ip_proto = keys.basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; ip_multipath_l3_keys(skb, &hash_keys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = fib_multipath_custom_hash_skb(net, skb); else mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); return mhash >> 1; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static enum skb_drop_reason ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif /* create a routing cache entry */ return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp); } /* Implements all the saddr-related checks as ip_route_input_slow(), * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ enum skb_drop_reason ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); u32 tag = 0; if (!in_dev) return reason; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } if (!(rt->rt_flags & RTCF_LOCAL)) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &tag); if (reason) goto martian_source; skip_validate_source: skb_dst_copy(skb, hint); return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); return reason; } /* get device for dst_alloc with local routes */ static struct net_device *ip_rt_get_dev(struct net *net, const struct fib_result *res) { struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; struct net_device *dev = NULL; if (nhc) dev = l3mdev_master_dev_rcu(nhc->nhc_dev); return dev ? : net->loopback_dev; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * Changes in the enforced policies must be applied also to * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. * called with rcu_read_lock() */ static enum skb_drop_reason ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; struct flowi4 fl4; bool do_cache = true; /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected * by fib_lookup. */ tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } res->fi = NULL; res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(daddr)) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; } } else if (ipv4_is_loopback(saddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } } /* * Now we are ready to route packet. */ fl4.flowi4_l3mdev = 0; fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; } else { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } if (res->type == RTN_BROADCAST) { if (IN_DEV_BFORWARD(in_dev)) goto make_route; /* not do cache if bc_forwarding is enabled */ if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING)) do_cache = false; goto brd_input; } err = -EINVAL; if (res->type == RTN_LOCAL) { reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; goto local_input; } if (!IN_DEV_FORWARD(in_dev)) { err = -EHOSTUNREACH; goto no_route; } if (res->type != RTN_UNICAST) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } make_route: reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); out: return reason; brd_input: if (skb->protocol != htons(ETH_P_IP)) { reason = SKB_DROP_REASON_INVALID_PROTO; goto out; } if (!ipv4_is_zeronet(saddr)) { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; } flags |= RTCF_BROADCAST; res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; } } rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; no_route: RT_CACHE_STAT_INC(in_no_route); res->type = RTN_UNREACHABLE; res->fi = NULL; res->table = NULL; goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif goto out; e_nobufs: reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } /* called with rcu_read_lock held */ static enum skb_drop_reason ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing * hardware multicast filters :-( As result the host on multicasting * network acquires a lot of useless route cache entries, sort of * SDR messages from all the world. Now we try to get rid of them. * Really, provided software IP multicast filter is organized * reasonably (at least, hashed), it does not result in a slowdown * comparing with route cache reject entries. * Note, that multicast routers are not affected, because * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; if (!in_dev) return reason; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); /* check l3 master if no match yet */ if (!our && netif_is_l3_slave(dev)) { struct in_device *l3_in_dev; l3_in_dev = __in_dev_get_rcu(skb->dev); if (l3_in_dev) our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, ip_hdr(skb)->protocol); } if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our); } return reason; } return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; rcu_read_lock(); reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); return reason; } EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) { type = RTN_BROADCAST; /* reset fi to prevent gateway resolution */ fi = NULL; } else if (ipv4_is_multicast(fl4->daddr)) { type = RTN_MULTICAST; } else if (ipv4_is_zeronet(fl4->daddr)) { return ERR_PTR(-EINVAL); } if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; else do_cache = false; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. */ if (fi && res->prefixlen < 4) fi = NULL; } else if ((type == RTN_LOCAL) && (orig_oif != 0) && (orig_oif != dev_out->ifindex)) { /* For local routes that require a particular output interface * we do not want to cache the result. Caching the result * causes incorrect behaviour when there are multiple source * addresses on the interface, the end result being that if the * intended recipient is waiting on that interface for the * packet he won't receive it because it will be delivered on * the loopback interface and the IP_PKTINFO ipi_ifindex will * be set to the loopback interface as well. */ do_cache = false; } fnhe = NULL; do_cache &= fi != NULL; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { prth = &fnhe->fnhe_rth_output; } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; } add: rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; RT_CACHE_STAT_INC(out_slow_tot); if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mr_output; } } #endif } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); lwtunnel_set_redirect(&rth->dst); return rth; } /* * Major route resolver routine. */ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { struct fib_result res = { .type = RTN_UNSPEC, .fi = NULL, .table = NULL, .tclassid = 0, }; struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); rcu_read_unlock(); return rth; } EXPORT_SYMBOL_GPL(ip_route_output_key_hash); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) { struct net_device *dev_out = NULL; int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; int err; if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. * It was wrong for two reasons: * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr * is assigned to multiple interfaces. * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && (ipv4_is_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); if (!dev_out) goto out; /* Special hack: user can direct multicasts * and limited broadcast via necessary interface * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. * This hack is not just for fun, it allows * vic,vat and friends to work. * They bind socket to loopback, set ttl to zero * and expect that it will work. * From the viewpoint of routing cache they are broken, * because we are not allowed to build multicast path * with loopback source addr (look, routing cache * cannot know, that ttl is zero, so that packet * will not leave this host and route is valid). * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; goto make_route; } if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (!dev_out) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); goto out; } if (ipv4_is_local_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr) || fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); else if (!fl4->daddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl4->daddr) { fl4->daddr = fl4->saddr; if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } err = fib_lookup(net, fl4, res, 0); if (err) { res->fi = NULL; res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) { /* Apparently, routing tables are wrong. Assume, * that the destination is on link. * * WHY? DW. * Because we are allowed to send to iface * even if it has NO routes and NO assigned * addresses. When oif is specified, routing * tables are looked up with only one purpose: * to catch if destination is gatewayed, rather than * direct. Moreover, if MSG_DONTROUTE is set, * we send packet, ignoring both routing tables * and ifaddr state. --ANK * * * We could make it even if oif is unknown, * likely IPv6, but we do not. */ if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } if (res->type == RTN_LOCAL) { if (!fl4->saddr) { if (res->fi->fib_prefsrc) fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : net->loopback_dev; /* make sure orig_oif points to fib result device even * though packet rx/tx happens over loopback or l3mdev */ orig_oif = FIB_RES_OIF(*res); fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: return rth; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .default_advmss = ipv4_default_advmss, .neigh_lookup = ipv4_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; new->dev = net->loopback_dev; netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; else if (rt->rt_gw_family == AF_INET6) rt->rt_gw6 = ort->rt_gw6; } dst_release(dst_orig); return rt ? &rt->dst : ERR_PTR(-ENOMEM); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0)); } return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, dscp_t dscp, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; u32 error; u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = inet_dscp_to_dsfield(dscp); r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway) { if (rt->rt_gw_family == AF_INET && nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; } else if (rt->rt_gw_family == AF_INET6) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &rt->rt_gw6, alen); } } expires = READ_ONCE(rt->dst.expires); if (expires) { unsigned long now = jiffies; if (time_before(now, expires)) expires -= now; else expires = 0; } memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rt->rt_mtu_locked && expires) metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (fl4) { if (fl4->flowi4_mark && nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && nla_put_u32(skb, RTA_UID, from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) goto nla_put_failure; if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, r, portid); if (err <= 0) { if (err == 0) return 0; goto nla_put_failure; } } else #endif if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) goto nla_put_failure; } } error = rt->dst.error; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, int *fa_index, int fa_start, unsigned int flags) { int i; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference(bucket[i].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { struct rtable *rt; int err; if (*fa_index < fa_start) goto next; if (fnhe->fnhe_genid != genid) goto next; if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) goto next; rt = rcu_dereference(fnhe->fnhe_rth_input); if (!rt) rt = rcu_dereference(fnhe->fnhe_rth_output); if (!rt) goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, 0, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) return err; next: (*fa_index)++; } } return 0; } int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); struct fnhe_hash_bucket *bucket; int err; if (nhc->nhc_flags & RTNH_F_DEAD) continue; rcu_read_lock(); bucket = rcu_dereference(nhc->nhc_exceptions); err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, genid, fa_index, fa_start, flags); rcu_read_unlock(); if (err) return err; } return 0; } static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) { struct sk_buff *skb; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; /* Reserve room for dummy headers, this skb can pass * through good chunk of routing engine. */ skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = ip_proto; iph->saddr = src; iph->daddr = dst; iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; skb_set_transport_header(skb, skb->len); switch (iph->protocol) { case IPPROTO_UDP: { struct udphdr *udph; udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } case IPPROTO_TCP: { struct tcphdr *tcph; tcph = skb_put_zero(skb, sizeof(struct tcphdr)); tcph->source = sport; tcph->dest = dport; tcph->doff = sizeof(struct tcphdr) / 4; tcph->rst = 1; tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), src, dst, 0); break; } case IPPROTO_ICMP: { struct icmphdr *icmph; icmph = skb_put_zero(skb, sizeof(struct icmphdr)); icmph->type = ICMP_ECHO; icmph->code = 0; } } return skb; } static int inet_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_NOTIFY | RTM_F_LOOKUP_TABLE | RTM_F_FIB_MATCH)) { NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_IIF: case RTA_OIF: case RTA_SRC: case RTA_DST: case RTA_IP_PROTO: case RTA_SPORT: case RTA_DPORT: case RTA_MARK: case RTA_UID: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); return -EINVAL; } } return 0; } static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; u32 table_id = RT_TABLE_MAIN; __be16 sport = 0, dport = 0; struct fib_result res = {}; u8 ip_proto = IPPROTO_UDP; struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; dscp_t dscp; kuid_t uid; u32 iif; int err; int mark; err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); src = nla_get_in_addr_default(tb[RTA_SRC], 0); dst = nla_get_in_addr_default(tb[RTA_DST], 0); iif = nla_get_u32_default(tb[RTA_IIF], 0); mark = nla_get_u32_default(tb[RTA_MARK], 0); dscp = inet_dsfield_to_dscp(rtm->rtm_tos); if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else uid = (iif ? INVALID_UID : current_uid()); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &ip_proto, AF_INET, extack); if (err) return err; } if (tb[RTA_SPORT]) sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) dport = nla_get_be16(tb[RTA_DPORT]); skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); if (!skb) return -ENOBUFS; fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_dscp = dscp; fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) fl4.fl4_sport = sport; if (dport) fl4.fl4_dport = dport; fl4.flowi4_proto = ip_proto; rcu_read_lock(); if (iif) { struct net_device *dev; dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_rcu; } fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, dscp, dev, &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); else skb_dst_set(skb, &rt->dst); } if (err) goto errout_rcu; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = res.table ? res.table->tb_id : 0; /* reset skb for netlink reply msg */ skb_trim(skb, 0); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { struct fib_rt_info fri; if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } fri.fi = res.fi; fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; fri.dscp = res.dscp; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { u8 slen = 32 - fri.dst_len; if (fa->fa_slen == slen && fa->tb_id == fri.tb_id && fa->fa_dscp == fri.dscp && fa->fa_info == res.fi && fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); break; } } } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: return err; errout_rcu: rcu_read_unlock(); kfree_skb(skb); goto errout_free; } void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; if (write) { rt_cache_flush(net); fnhe_genid_bump(net); return 0; } return -EINVAL; } static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { /* Deprecated. Use gc_min_interval_ms */ .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static const char ipv4_route_flush_procname[] = "flush"; static struct ctl_table ipv4_route_netns_table[] = { { .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, }, { .procname = "min_pmtu", .data = &init_net.ipv4.ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &ip_min_valid_pmtu, }, { .procname = "mtu_expires", .data = &init_net.ipv4.ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv4.ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; size_t table_size = ARRAY_SIZE(ipv4_route_netns_table); tbl = ipv4_route_netns_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL); if (!tbl) goto err_dup; /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; } /* Update the variables to point into the current struct net * except for the first element flush */ for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route", tbl, table_size); if (!net->ipv4.route_hdr) goto err_reg; return 0; err_reg: if (tbl != ipv4_route_netns_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_route_net_exit(struct net *net) { const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); BUG_ON(tbl == ipv4_route_netns_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, }; #endif static __net_init int netns_ip_rt_init(struct net *net) { /* Set default value for namespaceified sysctls */ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU; net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES; net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS; return 0; } static struct pernet_operations __net_initdata ip_rt_ops = { .init = netns_ip_rt_init, }; static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; static int __net_init ipv4_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv4.peers = bp; return 0; } static void __net_exit ipv4_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv4.peers; net->ipv4.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, }; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip_rt_init(void) { void *idents_hash; int cpu; /* For modern hosts, this will use 2 MB of memory */ idents_hash = alloc_large_system_hash("IP idents", sizeof(*ip_idents) + sizeof(*ip_tstamps), 0, 16, /* one bucket per 64 KB */ HASH_ZERO, NULL, &ip_idents_mask, 2048, 256*1024); ip_idents = idents_hash; get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable, SLAB_HWCACHE_ALIGN | SLAB_PANIC); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; if (dst_entries_init(&ipv4_dst_ops) < 0) panic("IP: failed to allocate ipv4_dst_ops counter\n"); if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); ipv4_dst_ops.gc_thresh = ~0; ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); #endif rtnl_register_many(ip_rt_rtnl_msg_handlers); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&ip_rt_ops); register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); return 0; } #ifdef CONFIG_SYSCTL /* * We really need to sanitize the damn ipv4 init order, then all * this nonsense will go away. */ void __init ip_static_sysctl_init(void) { register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif
5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2012 Smith Micro Software, Inc. * Copyright (c) 2012 Bjørn Mork <bjorn@mork.no> * * This driver is based on and reuse most of cdc_ncm, which is * Copyright (C) ST-Ericsson 2010-2012 */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/ethtool.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #include <linux/usb/cdc-wdm.h> #include <linux/usb/cdc_ncm.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/ipv6_stubs.h> #include <net/ndisc.h> /* alternative VLAN for IP session 0 if not untagged */ #define MBIM_IPS0_VID 4094 /* driver specific data - must match cdc_ncm usage */ struct cdc_mbim_state { struct cdc_ncm_ctx *ctx; atomic_t pmcount; struct usb_driver *subdriver; unsigned long _unused; unsigned long flags; }; /* flags for the cdc_mbim_state.flags field */ enum cdc_mbim_flags { FLAG_IPS0_VLAN = 1 << 0, /* IP session 0 is tagged */ }; /* using a counter to merge subdriver requests with our own into a combined state */ static int cdc_mbim_manage_power(struct usbnet *dev, int on) { struct cdc_mbim_state *info = (void *)&dev->data; int rv = 0; dev_dbg(&dev->intf->dev, "%s() pmcount=%d, on=%d\n", __func__, atomic_read(&info->pmcount), on); if ((on && atomic_add_return(1, &info->pmcount) == 1) || (!on && atomic_dec_and_test(&info->pmcount))) { /* need autopm_get/put here to ensure the usbcore sees the new value */ rv = usb_autopm_get_interface(dev->intf); dev->intf->needs_remote_wakeup = on; if (!rv) usb_autopm_put_interface(dev->intf); } return 0; } static int cdc_mbim_wdm_manage_power(struct usb_interface *intf, int status) { struct usbnet *dev = usb_get_intfdata(intf); /* can be called while disconnecting */ if (!dev) return 0; return cdc_mbim_manage_power(dev, status); } static int cdc_mbim_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) { struct usbnet *dev = netdev_priv(netdev); struct cdc_mbim_state *info = (void *)&dev->data; /* creation of this VLAN is a request to tag IP session 0 */ if (vid == MBIM_IPS0_VID) info->flags |= FLAG_IPS0_VLAN; else if (vid >= 512) /* we don't map these to MBIM session */ return -EINVAL; return 0; } static int cdc_mbim_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) { struct usbnet *dev = netdev_priv(netdev); struct cdc_mbim_state *info = (void *)&dev->data; /* this is a request for an untagged IP session 0 */ if (vid == MBIM_IPS0_VID) info->flags &= ~FLAG_IPS0_VLAN; return 0; } static const struct net_device_ops cdc_mbim_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_get_stats64 = dev_get_tstats64, .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = cdc_mbim_rx_add_vid, .ndo_vlan_rx_kill_vid = cdc_mbim_rx_kill_vid, }; /* Change the control interface altsetting and update the .driver_info * pointer if the matching entry after changing class codes points to * a different struct */ static int cdc_mbim_set_ctrlalt(struct usbnet *dev, struct usb_interface *intf, u8 alt) { struct usb_driver *driver = to_usb_driver(intf->dev.driver); const struct usb_device_id *id; struct driver_info *info; int ret; ret = usb_set_interface(dev->udev, intf->cur_altsetting->desc.bInterfaceNumber, alt); if (ret) return ret; id = usb_match_id(intf, driver->id_table); if (!id) return -ENODEV; info = (struct driver_info *)id->driver_info; if (info != dev->driver_info) { dev_dbg(&intf->dev, "driver_info updated to '%s'\n", info->description); dev->driver_info = info; } return 0; } static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_ncm_ctx *ctx; struct usb_driver *subdriver = ERR_PTR(-ENODEV); int ret = -ENODEV; u8 data_altsetting = 1; struct cdc_mbim_state *info = (void *)&dev->data; /* should we change control altsetting on a NCM/MBIM function? */ if (cdc_ncm_select_altsetting(intf) == CDC_NCM_COMM_ALTSETTING_MBIM) { data_altsetting = CDC_NCM_DATA_ALTSETTING_MBIM; ret = cdc_mbim_set_ctrlalt(dev, intf, CDC_NCM_COMM_ALTSETTING_MBIM); if (ret) goto err; ret = -ENODEV; } /* we will hit this for NCM/MBIM functions if prefer_mbim is false */ if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting)) goto err; ret = cdc_ncm_bind_common(dev, intf, data_altsetting, dev->driver_info->data); if (ret) goto err; ctx = info->ctx; /* The MBIM descriptor and the status endpoint are required */ if (ctx->mbim_desc && dev->status) subdriver = usb_cdc_wdm_register(ctx->control, &dev->status->desc, le16_to_cpu(ctx->mbim_desc->wMaxControlMessage), WWAN_PORT_MBIM, cdc_mbim_wdm_manage_power); if (IS_ERR(subdriver)) { ret = PTR_ERR(subdriver); cdc_ncm_unbind(dev, intf); goto err; } /* can't let usbnet use the interrupt endpoint */ dev->status = NULL; info->subdriver = subdriver; /* MBIM cannot do ARP */ dev->net->flags |= IFF_NOARP; /* no need to put the VLAN tci in the packet headers */ dev->net->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_FILTER; /* monitor VLAN additions and removals */ dev->net->netdev_ops = &cdc_mbim_netdev_ops; err: return ret; } static void cdc_mbim_unbind(struct usbnet *dev, struct usb_interface *intf) { struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; /* disconnect subdriver from control interface */ if (info->subdriver && info->subdriver->disconnect) info->subdriver->disconnect(ctx->control); info->subdriver = NULL; /* let NCM unbind clean up both control and data interface */ cdc_ncm_unbind(dev, intf); } /* verify that the ethernet protocol is IPv4 or IPv6 */ static bool is_ip_proto(__be16 proto) { switch (proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): return true; } return false; } static struct sk_buff *cdc_mbim_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { struct sk_buff *skb_out; struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; __le32 sign = cpu_to_le32(USB_CDC_MBIM_NDP16_IPS_SIGN); u16 tci = 0; bool is_ip; u8 *c; if (!ctx) goto error; if (skb) { if (skb->len <= ETH_HLEN) goto error; /* Some applications using e.g. packet sockets will * bypass the VLAN acceleration and create tagged * ethernet frames directly. We primarily look for * the accelerated out-of-band tag, but fall back if * required */ skb_reset_mac_header(skb); if (vlan_get_tag(skb, &tci) < 0 && skb->len > VLAN_ETH_HLEN && __vlan_get_tag(skb, &tci) == 0) { is_ip = is_ip_proto(vlan_eth_hdr(skb)->h_vlan_encapsulated_proto); skb_pull(skb, VLAN_ETH_HLEN); } else { is_ip = is_ip_proto(eth_hdr(skb)->h_proto); skb_pull(skb, ETH_HLEN); } /* Is IP session <0> tagged too? */ if (info->flags & FLAG_IPS0_VLAN) { /* drop all untagged packets */ if (!tci) goto error; /* map MBIM_IPS0_VID to IPS<0> */ if (tci == MBIM_IPS0_VID) tci = 0; } /* mapping VLANs to MBIM sessions: * no tag => IPS session <0> if !FLAG_IPS0_VLAN * 1 - 255 => IPS session <vlanid> * 256 - 511 => DSS session <vlanid - 256> * 512 - 4093 => unsupported, drop * 4094 => IPS session <0> if FLAG_IPS0_VLAN */ switch (tci & 0x0f00) { case 0x0000: /* VLAN ID 0 - 255 */ if (!is_ip) goto error; c = (u8 *)&sign; c[3] = tci; break; case 0x0100: /* VLAN ID 256 - 511 */ if (is_ip) goto error; sign = cpu_to_le32(USB_CDC_MBIM_NDP16_DSS_SIGN); c = (u8 *)&sign; c[3] = tci; break; default: netif_err(dev, tx_err, dev->net, "unsupported tci=0x%04x\n", tci); goto error; } } spin_lock_bh(&ctx->mtx); skb_out = cdc_ncm_fill_tx_frame(dev, skb, sign); spin_unlock_bh(&ctx->mtx); return skb_out; error: if (skb) dev_kfree_skb_any(skb); return NULL; } /* Some devices are known to send Neighbor Solicitation messages and * require Neighbor Advertisement replies. The IPv6 core will not * respond since IFF_NOARP is set, so we must handle them ourselves. */ static void do_neigh_solicit(struct usbnet *dev, u8 *buf, u16 tci) { struct ipv6hdr *iph = (void *)buf; struct nd_msg *msg = (void *)(iph + 1); struct net_device *netdev; struct inet6_dev *in6_dev; bool is_router; /* we'll only respond to requests from unicast addresses to * our solicited node addresses. */ if (!ipv6_addr_is_solict_mult(&iph->daddr) || !(ipv6_addr_type(&iph->saddr) & IPV6_ADDR_UNICAST)) return; /* need to send the NA on the VLAN dev, if any */ rcu_read_lock(); if (tci) { netdev = __vlan_find_dev_deep_rcu(dev->net, htons(ETH_P_8021Q), tci); if (!netdev) { rcu_read_unlock(); return; } } else { netdev = dev->net; } dev_hold(netdev); rcu_read_unlock(); in6_dev = in6_dev_get(netdev); if (!in6_dev) goto out; is_router = !!READ_ONCE(in6_dev->cnf.forwarding); in6_dev_put(in6_dev); /* ipv6_stub != NULL if in6_dev_get returned an inet6_dev */ ipv6_stub->ndisc_send_na(netdev, &iph->saddr, &msg->target, is_router /* router */, true /* solicited */, false /* override */, true /* inc_opt */); out: dev_put(netdev); } static bool is_neigh_solicit(u8 *buf, size_t len) { struct ipv6hdr *iph = (void *)buf; struct nd_msg *msg = (void *)(iph + 1); return (len >= sizeof(struct ipv6hdr) + sizeof(struct nd_msg) && iph->nexthdr == IPPROTO_ICMPV6 && msg->icmph.icmp6_code == 0 && msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION); } static struct sk_buff *cdc_mbim_process_dgram(struct usbnet *dev, u8 *buf, size_t len, u16 tci) { __be16 proto = htons(ETH_P_802_3); struct sk_buff *skb = NULL; if (tci < 256 || tci == MBIM_IPS0_VID) { /* IPS session? */ if (len < sizeof(struct iphdr)) goto err; switch (*buf & 0xf0) { case 0x40: proto = htons(ETH_P_IP); break; case 0x60: if (is_neigh_solicit(buf, len)) do_neigh_solicit(dev, buf, tci); proto = htons(ETH_P_IPV6); break; default: goto err; } } skb = netdev_alloc_skb_ip_align(dev->net, len + ETH_HLEN); if (!skb) goto err; /* add an ethernet header */ skb_put(skb, ETH_HLEN); skb_reset_mac_header(skb); eth_hdr(skb)->h_proto = proto; eth_zero_addr(eth_hdr(skb)->h_source); memcpy(eth_hdr(skb)->h_dest, dev->net->dev_addr, ETH_ALEN); /* add datagram */ skb_put_data(skb, buf, len); /* map MBIM session to VLAN */ if (tci) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), tci); err: return skb; } static int cdc_mbim_rx_fixup(struct usbnet *dev, struct sk_buff *skb_in) { struct sk_buff *skb; struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; int len; int nframes; int x; int offset; struct usb_cdc_ncm_ndp16 *ndp16; struct usb_cdc_ncm_dpe16 *dpe16; int ndpoffset; int loopcount = 50; /* arbitrary max preventing infinite loop */ u32 payload = 0; u8 *c; u16 tci; ndpoffset = cdc_ncm_rx_verify_nth16(ctx, skb_in); if (ndpoffset < 0) goto error; next_ndp: nframes = cdc_ncm_rx_verify_ndp16(skb_in, ndpoffset); if (nframes < 0) goto error; ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb_in->data + ndpoffset); switch (ndp16->dwSignature & cpu_to_le32(0x00ffffff)) { case cpu_to_le32(USB_CDC_MBIM_NDP16_IPS_SIGN): c = (u8 *)&ndp16->dwSignature; tci = c[3]; /* tag IPS<0> packets too if MBIM_IPS0_VID exists */ if (!tci && info->flags & FLAG_IPS0_VLAN) tci = MBIM_IPS0_VID; break; case cpu_to_le32(USB_CDC_MBIM_NDP16_DSS_SIGN): c = (u8 *)&ndp16->dwSignature; tci = c[3] + 256; break; default: netif_dbg(dev, rx_err, dev->net, "unsupported NDP signature <0x%08x>\n", le32_to_cpu(ndp16->dwSignature)); goto err_ndp; } dpe16 = ndp16->dpe16; for (x = 0; x < nframes; x++, dpe16++) { offset = le16_to_cpu(dpe16->wDatagramIndex); len = le16_to_cpu(dpe16->wDatagramLength); /* * CDC NCM ch. 3.7 * All entries after first NULL entry are to be ignored */ if ((offset == 0) || (len == 0)) { if (!x) goto err_ndp; /* empty NTB */ break; } /* sanity checking */ if (((offset + len) > skb_in->len) || (len > ctx->rx_max)) { netif_dbg(dev, rx_err, dev->net, "invalid frame detected (ignored) offset[%u]=%u, length=%u, skb=%p\n", x, offset, len, skb_in); if (!x) goto err_ndp; break; } else { skb = cdc_mbim_process_dgram(dev, skb_in->data + offset, len, tci); if (!skb) goto error; usbnet_skb_return(dev, skb); payload += len; /* count payload bytes in this NTB */ } } err_ndp: /* are there more NDPs to process? */ ndpoffset = le16_to_cpu(ndp16->wNextNdpIndex); if (ndpoffset && loopcount--) goto next_ndp; /* update stats */ ctx->rx_overhead += skb_in->len - payload; ctx->rx_ntbs++; return 1; error: return 0; } static int cdc_mbim_suspend(struct usb_interface *intf, pm_message_t message) { int ret = -ENODEV; struct usbnet *dev = usb_get_intfdata(intf); struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; if (!ctx) goto error; /* * Both usbnet_suspend() and subdriver->suspend() MUST return 0 * in system sleep context, otherwise, the resume callback has * to recover device from previous suspend failure. */ ret = usbnet_suspend(intf, message); if (ret < 0) goto error; if (intf == ctx->control && info->subdriver && info->subdriver->suspend) ret = info->subdriver->suspend(intf, message); if (ret < 0) usbnet_resume(intf); error: return ret; } static int cdc_mbim_resume(struct usb_interface *intf) { int ret = 0; struct usbnet *dev = usb_get_intfdata(intf); struct cdc_mbim_state *info = (void *)&dev->data; struct cdc_ncm_ctx *ctx = info->ctx; bool callsub = (intf == ctx->control && info->subdriver && info->subdriver->resume); if (callsub) ret = info->subdriver->resume(intf); if (ret < 0) goto err; ret = usbnet_resume(intf); if (ret < 0 && callsub) info->subdriver->suspend(intf, PMSG_SUSPEND); err: return ret; } static const struct driver_info cdc_mbim_info = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, }; /* MBIM and NCM devices should not need a ZLP after NTBs with * dwNtbOutMaxSize length. Nevertheless, a number of devices from * different vendor IDs will fail unless we send ZLPs, forcing us * to make this the default. * * This default may cause a performance penalty for spec conforming * devices wanting to take advantage of optimizations possible without * ZLPs. A whitelist is added in an attempt to avoid this for devices * known to conform to the MBIM specification. * * All known devices supporting NCM compatibility mode are also * conforming to the NCM and MBIM specifications. For this reason, the * NCM subclass entry is also in the ZLP whitelist. */ static const struct driver_info cdc_mbim_info_zlp = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN | FLAG_SEND_ZLP, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, }; /* The spefication explicitly allows NDPs to be placed anywhere in the * frame, but some devices fail unless the NDP is placed after the IP * packets. Using the CDC_NCM_FLAG_NDP_TO_END flags to force this * behaviour. * * Note: The current implementation of this feature restricts each NTB * to a single NDP, implying that multiplexed sessions cannot share an * NTB. This might affect performance for multiplexed sessions. */ static const struct driver_info cdc_mbim_info_ndp_to_end = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, .data = CDC_NCM_FLAG_NDP_TO_END, }; /* Some modems (e.g. Telit LE922A6) do not work properly with altsetting * toggle done in cdc_ncm_bind_common. CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE * flag is used to avoid this procedure. */ static const struct driver_info cdc_mbim_info_avoid_altsetting_toggle = { .description = "CDC MBIM", .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN | FLAG_SEND_ZLP, .bind = cdc_mbim_bind, .unbind = cdc_mbim_unbind, .manage_power = cdc_mbim_manage_power, .rx_fixup = cdc_mbim_rx_fixup, .tx_fixup = cdc_mbim_tx_fixup, .data = CDC_MBIM_FLAG_AVOID_ALTSETTING_TOGGLE, }; static const struct usb_device_id mbim_devs[] = { /* This duplicate NCM entry is intentional. MBIM devices can * be disguised as NCM by default, and this is necessary to * allow us to bind the correct driver_info to such devices. * * bind() will sort out this for us, selecting the correct * entry and reject the other */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info, }, /* ZLP conformance whitelist: All Ericsson MBIM devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bdb, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info, }, /* Some Huawei devices, ME906s-158 (12d1:15c1) and E3372 * (12d1:157d), are known to fail unless the NDP is placed * after the IP packets. Applying the quirk to all Huawei * devices is broader than necessary, but harmless. */ { USB_VENDOR_AND_INTERFACE_INFO(0x12d1, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_ndp_to_end, }, /* The HP lt4132 (03f0:a31d) is a rebranded Huawei ME906s-158, * therefore it too requires the above "NDP to end" quirk. */ { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0xa31d, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_ndp_to_end, }, /* Telit LE922A6 in MBIM composition */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1041, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* Telit LN920 */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1061, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* Telit FN990A */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1071, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* Telit FE990A */ { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1081, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, /* default entry */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_zlp, }, { }, }; MODULE_DEVICE_TABLE(usb, mbim_devs); static struct usb_driver cdc_mbim_driver = { .name = "cdc_mbim", .id_table = mbim_devs, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = cdc_mbim_suspend, .resume = cdc_mbim_resume, .reset_resume = cdc_mbim_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(cdc_mbim_driver); MODULE_AUTHOR("Greg Suarez <gsuarez@smithmicro.com>"); MODULE_AUTHOR("Bjørn Mork <bjorn@mork.no>"); MODULE_DESCRIPTION("USB CDC MBIM host driver"); MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 1 1 1 1 1 490 490 46 37 46 37 489 172 174 175 175 422 421 490 491 490 481 11 32 33 173 175 33 33 32 32 19 19 19 19 17 22 22 21 33 22 12 15 15 15 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) { error = -EBUSY; goto err; } error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) goto err; kobject_get(p->kobj.parent); return 0; err: kfree_const(p->kobj.name); p->kobj.name = NULL; return error; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @cdev: the cdev structure * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev);
60 60 58 60 60 60 60 59 60 60 50 49 48 50 50 60 60 58 59 58 58 59 60 60 1 1 7 6 1 6 1 5 1 4 1 4 4 2 1 2 2 2 2 1 2 2 2 5 1 1 1 1 1 1 4 4 3 3 1 1 2 2 2 1 1 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: policy rules. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Thomas Graf <tgraf@suug.ch> * * Fixes: * Rani Assaf : local_rule cannot be deleted * Marc Boucher : routing by fwmark */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/inetdevice.h> #include <linux/init.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> #include <net/tcp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/fib_rules.h> #include <linux/indirect_call_wrapper.h> struct fib4_rule { struct fib_rule common; u8 dst_len; u8 src_len; dscp_t dscp; dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ __be32 src; __be32 srcmask; __be32 dst; __be32 dstmask; #ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; static bool fib4_rule_matchall(const struct fib_rule *rule) { struct fib4_rule *r = container_of(rule, struct fib4_rule, common); if (r->dst_len || r->src_len || r->dscp) return false; return fib_rule_matchall(rule); } bool fib4_rule_default(const struct fib_rule *rule) { if (!fib4_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || rule->l3mdev) return false; if (rule->table != RT_TABLE_LOCAL && rule->table != RT_TABLE_MAIN && rule->table != RT_TABLE_DEFAULT) return false; return true; } EXPORT_SYMBOL_GPL(fib4_rule_default); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, AF_INET, extack); } unsigned int fib4_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET); } int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_lookup_arg arg = { .result = res, .flags = flags, }; int err; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi4_to_flowi(flp)); err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); #ifdef CONFIG_IP_ROUTE_CLASSID if (arg.rule) res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid; else res->tclassid = 0; #endif if (err == -ESRCH) err = -ENETUNREACH; return err; } EXPORT_SYMBOL_GPL(__fib_lookup); INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { int err = -EAGAIN; struct fib_table *tbl; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } rcu_read_lock(); tb_id = fib_rule_get_table(rule, arg); tbl = fib_get_table(rule->fr_net, tb_id); if (tbl) err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *)arg->result, arg->flags); rcu_read_unlock(); return err; } INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg) { struct fib_result *result = arg->result; struct net_device *dev = NULL; if (result->fi) { struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0); dev = nhc->nhc_dev; } /* do not accept result if the route does * not meet the required prefix length */ if (result->prefixlen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device * belonging to a forbidden interface group */ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) goto suppress_route; return false; suppress_route: if (!(arg->flags & FIB_LOOKUP_NOREF)) fib_info_put(result->fi); return true; } INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; struct flowi4 *fl4 = &fl->u.ip4; __be32 daddr = fl4->daddr; __be32 saddr = fl4->saddr; if (((saddr ^ r->src) & r->srcmask) || ((daddr ^ r->dst) & r->dstmask)) return 0; /* When DSCP selector is used we need to match on the entire DSCP field * in the flow information structure. When TOS selector is used we need * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) return 0; if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) return 0; if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, fl4->fl4_sport)) return 0; if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, fl4->fl4_dport)) return 0; return 1; } static struct fib_table *fib_empty_table(struct net *net) { u32 id = 1; while (1) { if (!fib_get_table(net, id)) return fib_new_table(net, id); if (id++ == RT_TABLE_MAX) break; } return NULL; } static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4, struct netlink_ext_ack *extack) { if (rule4->dscp) { NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); return -EINVAL; } rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); rule4->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule4->dscp_full = true; return 0; } static int fib4_nl2rule_dscp_mask(const struct nlattr *nla, struct fib4_rule *rule4, struct netlink_ext_ack *extack) { dscp_t dscp_mask; if (!rule4->dscp_full) { NL_SET_ERR_MSG_ATTR(extack, nla, "Cannot specify DSCP mask without DSCP value"); return -EINVAL; } dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); if (rule4->dscp & ~dscp_mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); return -EINVAL; } rule4->dscp_mask = dscp_mask; return 0; } static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct fib4_rule *rule4 = (struct fib4_rule *)rule; struct net *net = rule->fr_net; int err = -EINVAL; if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) { NL_SET_ERR_MSG(extack, "Flow label cannot be specified for IPv4 FIB rules"); goto errout; } if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); goto errout; } /* IPv4 currently doesn't handle high order DSCP bits correctly */ if (frh->tos & ~IPTOS_TOS_MASK) { NL_SET_ERR_MSG(extack, "Invalid tos"); goto errout; } rule4->dscp = inet_dsfield_to_dscp(frh->tos); if (tb[FRA_DSCP] && fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0) goto errout; if (tb[FRA_DSCP_MASK] && fib4_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule4, extack) < 0) goto errout; /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) goto errout; if (rule->table == RT_TABLE_UNSPEC && !rule->l3mdev) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; table = fib_empty_table(net); if (!table) { err = -ENOBUFS; goto errout; } rule->table = table->tb_id; } } if (frh->src_len) rule4->src = nla_get_in_addr(tb[FRA_SRC]); if (frh->dst_len) rule4->dst = nla_get_in_addr(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); if (rule4->tclassid) atomic_inc(&net->ipv4.fib_num_tclassid_users); } #endif if (fib_rule_requires_fldissect(rule)) net->ipv4.fib_rules_require_fldissect++; rule4->src_len = frh->src_len; rule4->srcmask = inet_make_mask(rule4->src_len); rule4->dst_len = frh->dst_len; rule4->dstmask = inet_make_mask(rule4->dst_len); net->ipv4.fib_has_custom_rules = true; err = 0; errout: return err; } static int fib4_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; int err; /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) goto errout; #ifdef CONFIG_IP_ROUTE_CLASSID if (((struct fib4_rule *)rule)->tclassid) atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif net->ipv4.fib_has_custom_rules = true; if (net->ipv4.fib_rules_require_fldissect && fib_rule_requires_fldissect(rule)) net->ipv4.fib_rules_require_fldissect--; errout: return err; } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (frh->src_len && (rule4->src_len != frh->src_len)) return 0; if (frh->dst_len && (rule4->dst_len != frh->dst_len)) return 0; if (frh->tos && (rule4->dscp_full || inet_dscp_to_dsfield(rule4->dscp) != frh->tos)) return 0; if (tb[FRA_DSCP]) { dscp_t dscp; dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); if (!rule4->dscp_full || rule4->dscp != dscp) return 0; } if (tb[FRA_DSCP_MASK]) { dscp_t dscp_mask; dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); if (!rule4->dscp_full || rule4->dscp_mask != dscp_mask) return 0; } #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC]))) return 0; if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST]))) return 0; return 1; } static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { struct fib4_rule *rule4 = (struct fib4_rule *) rule; frh->dst_len = rule4->dst_len; frh->src_len = rule4->src_len; if (rule4->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, inet_dscp_to_dsfield(rule4->dscp) >> 2) || nla_put_u8(skb, FRA_DSCP_MASK, inet_dscp_to_dsfield(rule4->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule4->dscp); } if ((rule4->dst_len && nla_put_in_addr(skb, FRA_DST, rule4->dst)) || (rule4->src_len && nla_put_in_addr(skb, FRA_SRC, rule4->src))) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid && nla_put_u32(skb, FRA_FLOW, rule4->tclassid)) goto nla_put_failure; #endif return 0; nla_put_failure: return -ENOBUFS; } static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ + nla_total_size(4) /* flow */ + nla_total_size(1) /* dscp */ + nla_total_size(1); /* dscp mask */ } static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { rt_cache_flush(ops->fro_net); } static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), .action = fib4_rule_action, .suppress = fib4_rule_suppress, .match = fib4_rule_match, .configure = fib4_rule_configure, .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .owner = THIS_MODULE, }; static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT); if (err < 0) return err; return 0; } int __net_init fib4_rules_init(struct net *net) { int err; struct fib_rules_ops *ops; ops = fib_rules_register(&fib4_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); err = fib_default_rules_init(ops); if (err < 0) goto fail; net->ipv4.rules_ops = ops; net->ipv4.fib_has_custom_rules = false; net->ipv4.fib_rules_require_fldissect = 0; return 0; fail: /* also cleans all rules already added */ fib_rules_unregister(ops); return err; } void __net_exit fib4_rules_exit(struct net *net) { fib_rules_unregister(net->ipv4.rules_ops); }
122 123 123 122 122 123 122 155 155 3 3 3 1 1 1 1 141 2 2 2 2 2 2 2 2 143 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 1 2 2 1 1 1 1 1 1 1 1 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 // SPDX-License-Identifier: GPL-2.0 /* * Devices PM QoS constraints management * * Copyright (C) 2011 Texas Instruments, Inc. * * This module exposes the interface to kernel space for specifying * per-device PM QoS dependencies. It provides infrastructure for registration * of: * * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. * Watchers register to keep track of the current QoS needs of the system. * Watchers can register a per-device notification callback using the * dev_pm_qos_*_notifier API. The notification chain data is stored in the * per-device constraint data struct. * * Note about the per-device constraint data struct allocation: * . The per-device constraints data struct ptr is stored into the device * dev_pm_info. * . To minimize the data usage by the per-device constraints, the data struct * is only allocated at the first call to dev_pm_qos_add_request. * . The data is later free'd when the device is removed from the system. * . A global mutex protects the constraints users from the data being * allocated and free'd. */ #include <linux/pm_qos.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/mutex.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/err.h> #include <trace/events/power.h> #include "power.h" static DEFINE_MUTEX(dev_pm_qos_mtx); static DEFINE_MUTEX(dev_pm_qos_sysfs_mtx); /** * __dev_pm_qos_flags - Check PM QoS flags for a given device. * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. * * This routine must be called with dev->power.lock held. */ enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask) { struct dev_pm_qos *qos = dev->power.qos; struct pm_qos_flags *pqf; s32 val; lockdep_assert_held(&dev->power.lock); if (IS_ERR_OR_NULL(qos)) return PM_QOS_FLAGS_UNDEFINED; pqf = &qos->flags; if (list_empty(&pqf->list)) return PM_QOS_FLAGS_UNDEFINED; val = pqf->effective_flags & mask; if (val) return (val == mask) ? PM_QOS_FLAGS_ALL : PM_QOS_FLAGS_SOME; return PM_QOS_FLAGS_NONE; } /** * dev_pm_qos_flags - Check PM QoS flags for a given device (locked). * @dev: Device to check the PM QoS flags for. * @mask: Flags to check against. */ enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask) { unsigned long irqflags; enum pm_qos_flags_status ret; spin_lock_irqsave(&dev->power.lock, irqflags); ret = __dev_pm_qos_flags(dev, mask); spin_unlock_irqrestore(&dev->power.lock, irqflags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_flags); /** * __dev_pm_qos_resume_latency - Get resume latency constraint for a given device. * @dev: Device to get the PM QoS constraint value for. * * This routine must be called with dev->power.lock held. */ s32 __dev_pm_qos_resume_latency(struct device *dev) { lockdep_assert_held(&dev->power.lock); return dev_pm_qos_raw_resume_latency(dev); } /** * dev_pm_qos_read_value - Get PM QoS constraint for a given device (locked). * @dev: Device to get the PM QoS constraint value for. * @type: QoS request type. */ s32 dev_pm_qos_read_value(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos *qos = dev->power.qos; unsigned long flags; s32 ret; spin_lock_irqsave(&dev->power.lock, flags); switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_RESUME_LATENCY_NO_CONSTRAINT : pm_qos_read_value(&qos->resume_latency); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MIN); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = IS_ERR_OR_NULL(qos) ? PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE : freq_qos_read_value(&qos->freq, FREQ_QOS_MAX); break; default: WARN_ON(1); ret = 0; } spin_unlock_irqrestore(&dev->power.lock, flags); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_read_value); /** * apply_constraint - Add/modify/remove device PM QoS request. * @req: Constraint request to apply * @action: Action to perform (add/update/remove). * @value: Value to assign to the QoS request. * * Internal function to update the constraints list using the PM QoS core * code and if needed call the per-device callbacks. */ static int apply_constraint(struct dev_pm_qos_request *req, enum pm_qos_req_action action, s32 value) { struct dev_pm_qos *qos = req->dev->power.qos; int ret; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: if (WARN_ON(action != PM_QOS_REMOVE_REQ && value < 0)) value = 0; ret = pm_qos_update_target(&qos->resume_latency, &req->data.pnode, action, value); break; case DEV_PM_QOS_LATENCY_TOLERANCE: ret = pm_qos_update_target(&qos->latency_tolerance, &req->data.pnode, action, value); if (ret) { value = pm_qos_read_value(&qos->latency_tolerance); req->dev->power.set_latency_tolerance(req->dev, value); } break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_apply(&req->data.freq, action, value); break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); break; default: ret = -EINVAL; } return ret; } /* * dev_pm_qos_constraints_allocate * @dev: device to allocate data for * * Called at the first call to add_request, for constraint data allocation * Must be called with the dev_pm_qos_mtx mutex held */ static int dev_pm_qos_constraints_allocate(struct device *dev) { struct dev_pm_qos *qos; struct pm_qos_constraints *c; struct blocking_notifier_head *n; qos = kzalloc(sizeof(*qos), GFP_KERNEL); if (!qos) return -ENOMEM; n = kcalloc(3, sizeof(*n), GFP_KERNEL); if (!n) { kfree(qos); return -ENOMEM; } c = &qos->resume_latency; plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; c->type = PM_QOS_MIN; c->notifiers = n; BLOCKING_INIT_NOTIFIER_HEAD(n); c = &qos->latency_tolerance; plist_head_init(&c->list); c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; c->type = PM_QOS_MIN; freq_constraints_init(&qos->freq); INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); dev->power.qos = qos; spin_unlock_irq(&dev->power.lock); return 0; } static void __dev_pm_qos_hide_latency_limit(struct device *dev); static void __dev_pm_qos_hide_flags(struct device *dev); /** * dev_pm_qos_constraints_destroy * @dev: target device * * Called from the device PM subsystem on device removal under device_pm_lock(). */ void dev_pm_qos_constraints_destroy(struct device *dev) { struct dev_pm_qos *qos; struct dev_pm_qos_request *req, *tmp; struct pm_qos_constraints *c; struct pm_qos_flags *f; mutex_lock(&dev_pm_qos_sysfs_mtx); /* * If the device's PM QoS resume latency limit or PM QoS flags have been * exposed to user space, they have to be hidden at this point. */ pm_qos_sysfs_remove_resume_latency(dev); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); __dev_pm_qos_hide_flags(dev); qos = dev->power.qos; if (!qos) goto out; /* Flush the constraints lists for the device. */ c = &qos->resume_latency; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { /* * Update constraints list and call the notification * callbacks if needed */ apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->latency_tolerance; plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.min_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MIN_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } c = &qos->freq.max_freq; plist_for_each_entry_safe(req, tmp, &c->list, data.freq.pnode) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_MAX_FREQUENCY_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } spin_lock_irq(&dev->power.lock); dev->power.qos = ERR_PTR(-ENODEV); spin_unlock_irq(&dev->power.lock); kfree(qos->resume_latency.notifiers); kfree(qos); out: mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } static bool dev_pm_qos_invalid_req_type(struct device *dev, enum dev_pm_qos_req_type type) { return type == DEV_PM_QOS_LATENCY_TOLERANCE && !dev->power.set_latency_tolerance; } static int __dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret = 0; if (!dev || !req || dev_pm_qos_invalid_req_type(dev, type)) return -EINVAL; if (WARN(dev_pm_qos_request_active(req), "%s() called for already added request\n", __func__)) return -EINVAL; if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); trace_dev_pm_qos_add_request(dev_name(dev), type, value); if (ret) return ret; req->dev = dev; req->type = type; if (req->type == DEV_PM_QOS_MIN_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MIN, value); else if (req->type == DEV_PM_QOS_MAX_FREQUENCY) ret = freq_qos_add_request(&dev->power.qos->freq, &req->data.freq, FREQ_QOS_MAX, value); else ret = apply_constraint(req, PM_QOS_ADD_REQ, value); return ret; } /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint * @req: pointer to a preallocated handle * @type: type of the request * @value: defines the qos request * * This function inserts a new entry in the device constraints list of * requested qos performance characteristics. It recomputes the aggregate * QoS expectations of parameters and initializes the dev_pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENOMEM if there's not enough memory * to allocate for data structures, -ENODEV if the device has just been removed * from the system. * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); /** * __dev_pm_qos_update_request - Modify an existing device PM QoS request. * @req : PM QoS request to modify. * @new_value: New value to request. */ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { s32 curr_value; int ret = 0; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_MIN_FREQUENCY: case DEV_PM_QOS_MAX_FREQUENCY: curr_value = req->data.freq.pnode.prio; break; case DEV_PM_QOS_FLAGS: curr_value = req->data.flr.flags; break; default: return -EINVAL; } trace_dev_pm_qos_update_request(dev_name(req->dev), req->type, new_value); if (curr_value != new_value) ret = apply_constraint(req, PM_QOS_UPDATE_REQ, new_value); return ret; } /** * dev_pm_qos_update_request - modifies an existing qos request * @req : handle to list element holding a dev_pm_qos request to use * @new_value: defines the qos request * * Updates an existing dev PM qos request along with updating the * target value. * * Attempts are made to make this code callable on hot code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_update_request(struct dev_pm_qos_request *req, s32 new_value) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_update_request(req, new_value); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_request); static int __dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; if (!req) /*guard against callers passing in null */ return -EINVAL; if (WARN(!dev_pm_qos_request_active(req), "%s() called for unknown object\n", __func__)) return -EINVAL; if (IS_ERR_OR_NULL(req->dev->power.qos)) return -ENODEV; trace_dev_pm_qos_remove_request(dev_name(req->dev), req->type, PM_QOS_DEFAULT_VALUE); ret = apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); return ret; } /** * dev_pm_qos_remove_request - modifies an existing qos request * @req: handle to request list element * * Will remove pm qos request from the list of constraints and * recompute the current target value. Call this on slow code paths. * * Returns 1 if the aggregated constraint value has changed, * 0 if the aggregated constraint value has not changed, * -EINVAL in case of wrong parameters, -ENODEV if the device has been * removed from the system * * Callers should ensure that the target device is not RPM_SUSPENDED before * using this function for requests of type DEV_PM_QOS_FLAGS. */ int dev_pm_qos_remove_request(struct dev_pm_qos_request *req) { int ret; mutex_lock(&dev_pm_qos_mtx); ret = __dev_pm_qos_remove_request(req); mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_request); /** * dev_pm_qos_add_notifier - sets notification entry for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block managed by caller. * @type: request type. * * Will register the notifier into a notification chain that gets called * upon changes to the target value for the device. * * If the device's constraints object doesn't exist when this routine is called, * it will be created (or error code will be returned if that fails). */ int dev_pm_qos_add_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR(dev->power.qos)) ret = -ENODEV; else if (!dev->power.qos) ret = dev_pm_qos_constraints_allocate(dev); if (ret) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_register(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_add_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_notifier); /** * dev_pm_qos_remove_notifier - deletes notification for changes to target value * of per-device PM QoS constraints * * @dev: target device for the constraint * @notifier: notifier block to be removed. * @type: request type. * * Will remove the notifier from the notification chain that gets called * upon changes to the target value. */ int dev_pm_qos_remove_notifier(struct device *dev, struct notifier_block *notifier, enum dev_pm_qos_req_type type) { int ret = 0; mutex_lock(&dev_pm_qos_mtx); /* Silently return if the constraints object is not present. */ if (IS_ERR_OR_NULL(dev->power.qos)) goto unlock; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: ret = blocking_notifier_chain_unregister(dev->power.qos->resume_latency.notifiers, notifier); break; case DEV_PM_QOS_MIN_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MIN, notifier); break; case DEV_PM_QOS_MAX_FREQUENCY: ret = freq_qos_remove_notifier(&dev->power.qos->freq, FREQ_QOS_MAX, notifier); break; default: WARN_ON(1); ret = -EINVAL; } unlock: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_remove_notifier); /** * dev_pm_qos_add_ancestor_request - Add PM QoS request for device's ancestor. * @dev: Device whose ancestor to add the request for. * @req: Pointer to the preallocated handle. * @type: Type of the request. * @value: Constraint latency value. */ int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { struct device *ancestor = dev->parent; int ret = -ENODEV; switch (type) { case DEV_PM_QOS_RESUME_LATENCY: while (ancestor && !ancestor->power.ignore_children) ancestor = ancestor->parent; break; case DEV_PM_QOS_LATENCY_TOLERANCE: while (ancestor && !ancestor->power.set_latency_tolerance) ancestor = ancestor->parent; break; default: ancestor = NULL; } if (ancestor) ret = dev_pm_qos_add_request(ancestor, req, type, value); if (ret < 0) req->dev = NULL; return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_ancestor_request); static void __dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { struct dev_pm_qos_request *req = NULL; switch(type) { case DEV_PM_QOS_RESUME_LATENCY: req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; case DEV_PM_QOS_LATENCY_TOLERANCE: req = dev->power.qos->latency_tolerance_req; dev->power.qos->latency_tolerance_req = NULL; break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; break; default: WARN_ON(1); return; } __dev_pm_qos_remove_request(req); kfree(req); } static void dev_pm_qos_drop_user_request(struct device *dev, enum dev_pm_qos_req_type type) { mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_drop_user_request(dev, type); mutex_unlock(&dev_pm_qos_mtx); } /** * dev_pm_qos_expose_latency_limit - Expose PM QoS latency limit to user space. * @dev: Device whose PM QoS latency limit is to be exposed to user space. * @value: Initial value of the latency limit. */ int dev_pm_qos_expose_latency_limit(struct device *dev, s32 value) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev) || value < 0) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_RESUME_LATENCY, value); if (ret < 0) { kfree(req); return ret; } mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->resume_latency_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->resume_latency_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_resume_latency(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_limit); static void __dev_pm_qos_hide_latency_limit(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->resume_latency_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_RESUME_LATENCY); } /** * dev_pm_qos_hide_latency_limit - Hide PM QoS latency limit from user space. * @dev: Device whose PM QoS latency limit is to be hidden from user space. */ void dev_pm_qos_hide_latency_limit(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_resume_latency(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_latency_limit(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_limit); /** * dev_pm_qos_expose_flags - Expose PM QoS flags of a device to user space. * @dev: Device whose PM QoS flags are to be exposed to user space. * @val: Initial values of the flags. */ int dev_pm_qos_expose_flags(struct device *dev, s32 val) { struct dev_pm_qos_request *req; int ret; if (!device_is_registered(dev)) return -EINVAL; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; ret = dev_pm_qos_add_request(dev, req, DEV_PM_QOS_FLAGS, val); if (ret < 0) { kfree(req); return ret; } pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos)) ret = -ENODEV; else if (dev->power.qos->flags_req) ret = -EEXIST; if (ret < 0) { __dev_pm_qos_remove_request(req); kfree(req); mutex_unlock(&dev_pm_qos_mtx); goto out; } dev->power.qos->flags_req = req; mutex_unlock(&dev_pm_qos_mtx); ret = pm_qos_sysfs_add_flags(dev); if (ret) dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); out: mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_flags); static void __dev_pm_qos_hide_flags(struct device *dev) { if (!IS_ERR_OR_NULL(dev->power.qos) && dev->power.qos->flags_req) __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_FLAGS); } /** * dev_pm_qos_hide_flags - Hide PM QoS flags of a device from user space. * @dev: Device whose PM QoS flags are to be hidden from user space. */ void dev_pm_qos_hide_flags(struct device *dev) { pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_flags(dev); mutex_lock(&dev_pm_qos_mtx); __dev_pm_qos_hide_flags(dev); mutex_unlock(&dev_pm_qos_mtx); mutex_unlock(&dev_pm_qos_sysfs_mtx); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_flags); /** * dev_pm_qos_update_flags - Update PM QoS flags request owned by user space. * @dev: Device to update the PM QoS flags request for. * @mask: Flags to set/clear. * @set: Whether to set or clear the flags (true means set). */ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) { s32 value; int ret; pm_runtime_get_sync(dev); mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->flags_req) { ret = -EINVAL; goto out; } value = dev_pm_qos_requested_flags(dev); if (set) value |= mask; else value &= ~mask; ret = __dev_pm_qos_update_request(dev->power.qos->flags_req, value); out: mutex_unlock(&dev_pm_qos_mtx); pm_runtime_put(dev); return ret; } /** * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. * @dev: Device to obtain the user space latency tolerance for. */ s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) { s32 ret; mutex_lock(&dev_pm_qos_mtx); ret = IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req ? PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : dev->power.qos->latency_tolerance_req->data.pnode.prio; mutex_unlock(&dev_pm_qos_mtx); return ret; } /** * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. * @dev: Device to update the user space latency tolerance for. * @val: New user space latency tolerance for @dev (negative values disable). */ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) { int ret; mutex_lock(&dev_pm_qos_mtx); if (IS_ERR_OR_NULL(dev->power.qos) || !dev->power.qos->latency_tolerance_req) { struct dev_pm_qos_request *req; if (val < 0) { if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) ret = 0; else ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { ret = -ENOMEM; goto out; } ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); if (ret < 0) { kfree(req); goto out; } dev->power.qos->latency_tolerance_req = req; } else { if (val < 0) { __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); ret = 0; } else { ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); } } out: mutex_unlock(&dev_pm_qos_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace * @dev: Device whose latency tolerance to expose */ int dev_pm_qos_expose_latency_tolerance(struct device *dev) { int ret; if (!dev->power.set_latency_tolerance) return -EINVAL; mutex_lock(&dev_pm_qos_sysfs_mtx); ret = pm_qos_sysfs_add_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_expose_latency_tolerance); /** * dev_pm_qos_hide_latency_tolerance - Hide latency tolerance from userspace * @dev: Device whose latency tolerance to hide */ void dev_pm_qos_hide_latency_tolerance(struct device *dev) { mutex_lock(&dev_pm_qos_sysfs_mtx); pm_qos_sysfs_remove_latency_tolerance(dev); mutex_unlock(&dev_pm_qos_sysfs_mtx); /* Remove the request from user space now */ pm_runtime_get_sync(dev); dev_pm_qos_update_user_latency_tolerance(dev, PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT); pm_runtime_put(dev); } EXPORT_SYMBOL_GPL(dev_pm_qos_hide_latency_tolerance);
17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 545 544 542 532 545 532 1 1 1 1 1 1 1 1 1 50 17 17 17 17 1 50 50 50 47 48 50 50 50 50 50 47 50 50 50 50 50 50 50 50 50 50 13 45 45 44 44 45 13 13 50 13 18 8 2 1 1 2 2 17 8 9 17 3 4 3 3 3 3 1 3 3 4 1 12 11 1 12 2 10 12 498 493 492 494 497 497 499 497 486 489 488 493 493 486 488 493 494 486 486 486 501 494 2 2 1 496 2 2 496 496 495 501 505 486 1 489 12 12 12 2 10 10 10 502 1 490 497 497 493 492 501 501 493 490 499 501 492 501 491 500 501 493 36 494 485 494 501 496 488 3 39 40 37 500 493 498 501 504 507 17 27 25 25 25 25 23 25 25 25 8 8 3 501 496 3 3 3 23 5 4 21 2 1 1 19 19 18 18 10 17 17 17 17 17 17 3 14 17 17 1 4 4 3 3 2 3 1 3 2 4 1 3 2 3 4 1 1 1 2 2 1 1 2 1 1 1 40 31 29 26 27 25 2 25 2 33 2 40 33 23 23 23 17 24 3 3 2 1 20 21 21 1 1 1 1 4 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 7 30 21 40 2 2 2 2 2 3 3 3 1 1 1 13 5 1 1 1 6 2 2 1 2 2 2 2 2 2 2 2 230 229 25 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 // SPDX-License-Identifier: GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <uapi/linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <net/rps.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> #include "tun_vnet.h" static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 struct tap_filter { unsigned int count; /* Number of addrs. Zero means disabled */ u32 mask[2]; /* Mask of the hashed addrs */ unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN]; }; /* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal * to max number of VCPUs in guest. */ #define MAX_TAP_QUEUES 256 #define MAX_TAP_FLOWS 4096 #define TUN_FLOW_EXPIRE (3 * HZ) /* A tun_file connects an open character device to a tuntap netdevice. It * also contains all socket related structures (except sock_fprog and tap_filter) * to serve as one transmit queue for tuntap device. The sock_fprog and * tap_filter were kept in tun_struct since they were used for filtering for the * netdevice not for a specific queue (at least I didn't see the requirement for * this). * * RCU usage: * The tun_file and tun_struct are loosely coupled, the pointer from one to the * other can only be read while rcu_read_lock or rtnl_lock is held. */ struct tun_file { struct sock sk; struct socket socket; struct tun_struct __rcu *tun; struct fasync_struct *fasync; /* only used for fasnyc */ unsigned int flags; union { u16 queue_index; unsigned int ifindex; }; struct napi_struct napi; bool napi_enabled; bool napi_frags_enabled; struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct ptr_ring tx_ring; struct xdp_rxq_info xdp_rxq; }; struct tun_page { struct page *page; int count; }; struct tun_flow_entry { struct hlist_node hash_link; struct rcu_head rcu; struct tun_struct *tun; u32 rxhash; u32 rps_rxhash; int queue_index; unsigned long updated ____cacheline_aligned_in_smp; }; #define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1) struct tun_prog { struct rcu_head rcu; struct bpf_prog *prog; }; /* Since the socket were moved to tun_file, to preserve the behavior of persist * device, socket filter, sndbuf and vnet header size were restore when the * file were attached to a persist device. */ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4 | \ NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static void tun_napi_enable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = timer_container_of(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. */ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_flow_entry *e; u32 txq, numqueues; numqueues = READ_ONCE(tun->numqueues); txq = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq); if (e) { tun_flow_save_rps_rxhash(e, txq); txq = e->queue_index; } else { txq = reciprocal_scale(txq, numqueues); } return txq; } static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb) { struct tun_prog *prog; u32 numqueues; u16 ret = 0; numqueues = READ_ONCE(tun->numqueues); if (!numqueues) return 0; prog = rcu_dereference(tun->steering_prog); if (prog) ret = bpf_prog_run_clear_cb(prog->prog, skb); return ret % numqueues; } static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct tun_struct *tun = netdev_priv(dev); u16 ret; rcu_read_lock(); if (rcu_dereference(tun->steering_prog)) ret = tun_ebpf_select_queue(tun, skb); else ret = tun_automq_select_queue(tun, skb); rcu_read_unlock(); return ret; } static inline bool tun_not_capable(struct tun_struct *tun) { const struct cred *cred = current_cred(); struct net *net = dev_net(tun->dev); return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) || (gid_valid(tun->group) && !in_egroup_p(tun->group))) && !ns_capable(net->user_ns, CAP_NET_ADMIN); } static void tun_set_real_num_queues(struct tun_struct *tun) { netif_set_real_num_tx_queues(tun->dev, tun->numqueues); netif_set_real_num_rx_queues(tun->dev, tun->numqueues); } static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile) { tfile->detached = tun; list_add_tail(&tfile->next, &tun->disabled); ++tun->numdisabled; } static struct tun_struct *tun_enable_queue(struct tun_file *tfile) { struct tun_struct *tun = tfile->detached; tfile->detached = NULL; list_del_init(&tfile->next); --tun->numdisabled; return tun; } void tun_ptr_free(void *ptr) { if (!ptr) return; if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); xdp_return_frame(xdpf); } else { __skb_array_destroy_skb(ptr); } } EXPORT_SYMBOL_GPL(tun_ptr_free); static void tun_queue_purge(struct tun_file *tfile) { void *ptr; while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL) tun_ptr_free(ptr); skb_queue_purge(&tfile->sk.sk_write_queue); skb_queue_purge(&tfile->sk.sk_error_queue); } static void __tun_detach(struct tun_file *tfile, bool clean) { struct tun_file *ntfile; struct tun_struct *tun; tun = rtnl_dereference(tfile->tun); if (tun && clean) { if (!tfile->detached) tun_napi_disable(tfile); tun_napi_del(tfile); } if (tun && !tfile->detached) { u16 index = tfile->queue_index; BUG_ON(index >= tun->numqueues); rcu_assign_pointer(tun->tfiles[index], tun->tfiles[tun->numqueues - 1]); ntfile = rtnl_dereference(tun->tfiles[index]); ntfile->queue_index = index; ntfile->xdp_rxq.queue_index = index; rcu_assign_pointer(tun->tfiles[tun->numqueues - 1], NULL); --tun->numqueues; if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); } else { tun_disable_queue(tun, tfile); tun_napi_disable(tfile); } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); /* Drop read queue */ tun_queue_purge(tfile); tun_set_real_num_queues(tun); } else if (tfile->detached && clean) { tun = tun_enable_queue(tfile); sock_put(&tfile->sk); } if (clean) { if (tun && tun->numqueues == 0 && tun->numdisabled == 0) { netif_carrier_off(tun->dev); if (!(tun->flags & IFF_PERSIST) && tun->dev->reg_state == NETREG_REGISTERED) unregister_netdevice(tun->dev); } if (tun) xdp_rxq_info_unreg(&tfile->xdp_rxq); ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free); } } static void tun_detach(struct tun_file *tfile, bool clean) { struct tun_struct *tun; struct net_device *dev; rtnl_lock(); tun = rtnl_dereference(tfile->tun); dev = tun ? tun->dev : NULL; __tun_detach(tfile, clean); if (dev) netdev_state_change(dev); rtnl_unlock(); if (clean) sock_put(&tfile->sk); } static void tun_detach_all(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile, *tmp; int i, n = tun->numqueues; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); BUG_ON(!tfile); tun_napi_disable(tfile); tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); --tun->numqueues; } list_for_each_entry(tfile, &tun->disabled, next) { tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN; tfile->socket.sk->sk_data_ready(tfile->socket.sk); RCU_INIT_POINTER(tfile->tun, NULL); } BUG_ON(tun->numqueues != 0); synchronize_net(); for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tun_napi_del(tfile); /* Drop read queue */ tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); sock_put(&tfile->sk); } BUG_ON(tun->numdisabled != 0); if (tun->flags & IFF_PERSIST) module_put(THIS_MODULE); } static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter, bool napi, bool napi_frags, bool publish_tun) { struct tun_file *tfile = file->private_data; struct net_device *dev = tun->dev; int err; err = security_tun_dev_attach(tfile->socket.sk, tun->security); if (err < 0) goto out; err = -EINVAL; if (rtnl_dereference(tfile->tun) && !tfile->detached) goto out; err = -EBUSY; if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1) goto out; err = -E2BIG; if (!tfile->detached && tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES) goto out; err = 0; /* Re-attach the filter to persist device */ if (!skip_filter && (tun->filter_attached == true)) { lock_sock(tfile->socket.sk); err = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. */ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. */ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. */ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) return err; tun_flow_init(tun); dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->hw_enc_features = dev->hw_features; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); return err; } return 0; } /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { tun_detach_all(dev); } /* Net device open. */ static int tun_net_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } /* Net device close. */ static int tun_net_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } /* Net device start xmit */ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb) { #ifdef CONFIG_RPS if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) { /* Select queue was not called for the skbuff, so we extract the * RPS hash and save it into the flow_table here. */ struct tun_flow_entry *e; __u32 rxhash; rxhash = __skb_get_hash_symmetric(skb); e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash); if (e) tun_flow_save_rps_rxhash(e, rxhash); } #endif } static unsigned int run_ebpf_filter(struct tun_struct *tun, struct sk_buff *skb, int len) { struct tun_prog *prog = rcu_dereference(tun->filter_prog); if (prog) len = bpf_prog_run_clear_cb(prog->prog, skb); return len; } /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len; rcu_read_lock(); tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ if (!tfile) { drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len); /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. */ if (!check_filter(&tun->txflt, skb)) { drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; } if (tfile->socket.sk->sk_filter && sk_filter_reason(tfile->socket.sk, skb, &drop_reason)) goto drop; len = run_ebpf_filter(tun, skb, len); if (len == 0) { drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; } if (pskb_trim(skb, len)) { drop_reason = SKB_DROP_REASON_NOMEM; goto drop; } if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } skb_tx_timestamp(skb); /* Orphan the skb - required as we might hang on to it * for indefinite time. */ skb_orphan(skb); nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } /* dev->lltx requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); rcu_read_unlock(); return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } static void tun_net_mclist(struct net_device *dev) { /* * This callback is supposed to deal with mc filter in * _rx_ path and has nothing to do with the _tx_ path. * In rx path we always accept everything userspace gives us. */ } static netdev_features_t tun_net_fix_features(struct net_device *dev, netdev_features_t features) { struct tun_struct *tun = netdev_priv(dev); return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } static void tun_set_headroom(struct net_device *dev, int new_hr) { struct tun_struct *tun = netdev_priv(dev); if (new_hr < NET_SKB_PAD) new_hr = NET_SKB_PAD; tun->align = new_hr; } static void tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct tun_struct *tun = netdev_priv(dev); dev_get_tstats64(dev, stats); stats->rx_frame_errors += (unsigned long)atomic_long_read(&tun->rx_frame_errors); } static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; struct bpf_prog *old_prog; int i; old_prog = rtnl_dereference(tun->xdp_prog); rcu_assign_pointer(tun->xdp_prog, prog); if (old_prog) bpf_prog_put(old_prog); for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } list_for_each_entry(tfile, &tun->disabled, next) { if (prog) sock_set_flag(&tfile->sk, SOCK_XDP); else sock_reset_flag(&tfile->sk, SOCK_XDP); } return 0; } static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) { struct tun_struct *tun = netdev_priv(dev); if (!tun->numqueues) return -EPERM; netif_carrier_on(dev); } else { netif_carrier_off(dev); } return 0; } static const struct net_device_ops tun_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, .ndo_change_carrier = tun_net_change_carrier, }; static void __tun_xdp_flush_tfile(struct tun_file *tfile) { /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN); tfile->socket.sk->sk_data_ready(tfile->socket.sk); } static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { struct tun_struct *tun = netdev_priv(dev); struct tun_file *tfile; u32 numqueues; int nxmit = 0; int i; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; rcu_read_lock(); resample: numqueues = READ_ONCE(tun->numqueues); if (!numqueues) { rcu_read_unlock(); return -ENXIO; /* Caller will free/return all frames */ } tfile = rcu_dereference(tun->tfiles[smp_processor_id() % numqueues]); if (unlikely(!tfile)) goto resample; spin_lock(&tfile->tx_ring.producer_lock); for (i = 0; i < n; i++) { struct xdp_frame *xdp = frames[i]; /* Encode the XDP flag into lowest bit for consumer to differ * XDP buffer from sk_buff. */ void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; } spin_unlock(&tfile->tx_ring.producer_lock); if (flags & XDP_XMIT_FLUSH) __tun_xdp_flush_tfile(tfile); rcu_read_unlock(); return nxmit; } static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) { struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp); int nxmit; if (unlikely(!frame)) return -EOVERFLOW; nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH); if (!nxmit) xdp_return_frame_rx_napi(frame); return nxmit; } static const struct net_device_ops tap_netdev_ops = { .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_set_rx_mode = tun_net_mclist, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_bpf = tun_xdp, .ndo_xdp_xmit = tun_xdp_xmit, .ndo_change_carrier = tun_net_change_carrier, }; static void tun_flow_init(struct tun_struct *tun) { int i; for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) INIT_HLIST_HEAD(&tun->flows[i]); tun->ageing_time = TUN_FLOW_EXPIRE; timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0); mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + tun->ageing_time)); } static void tun_flow_uninit(struct tun_struct *tun) { timer_delete_sync(&tun->flow_gc_timer); tun_flow_flush(tun); } #define MIN_MTU 68 #define MAX_MTU 65535 /* Initialize net device. */ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. */ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. */ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad, int metasize) { struct sk_buff *skb = build_skb(buf, buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); if (metasize) skb_metadata_set(skb, metasize); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int metasize = 0; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, true); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; /* It is known that the xdp_buff was prepared with metadata * support, so the metasize will never be negative. */ metasize = xdp.data - xdp.data_meta; } bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad, metasize); out: bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; int good_linear; int copylen; int hdr_len = 0; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; netdev_features_t features = 0; /* * Keep it easy and always zero the whole buffer, even if the * tunnel-related field will be touched only when the feature * is enabled and the hdr size id compatible. */ memset(&hdr, 0, sizeof(hdr)); gso = (struct virtio_net_hdr *)&hdr; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); features = tun_vnet_hdr_guest_features(vnet_hdr_sz); hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, features, from, gso); if (hdr_len < 0) return hdr_len; len -= vnet_hdr_sz; } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ copylen = min(hdr_len ? hdr_len : GOODCOPY_LEN, good_linear); linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; linear = min(hdr_len, good_linear); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). */ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, &hdr)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } if (frags && skb != tfile->napi.skb) tfile->napi.skb = skb; } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. */ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. */ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t result; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; result = tun_get_user(tun, tfile, NULL, from, noblock, false); tun_put(tun); return result; } static ssize_t tun_put_user_xdp(struct tun_struct *tun, struct tun_file *tfile, struct xdp_frame *xdp_frame, struct iov_iter *iter) { int vnet_hdr_sz = 0; size_t size = xdp_frame->len; ssize_t ret; if (tun->flags & IFF_VNET_HDR) { struct virtio_net_hdr gso = { 0 }; vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); if (ret) return ret; } ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz; preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, ret); preempt_enable(); return ret; } /* Put packet to the user space buffer */ static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total; int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; int ret; if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); total = skb->len + vlan_hlen + vnet_hdr_sz; if (!(tun->flags & IFF_NO_PI)) { if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; total += sizeof(pi); if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; } if (vnet_hdr_sz) { struct virtio_net_hdr_v1_hash_tunnel hdr; struct virtio_net_hdr *gso; ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb, &hdr); if (ret) return ret; /* * Drop the packet if the configured header size is too small * WRT the enabled offloads. */ gso = (struct virtio_net_hdr *)&hdr; ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features, iter, gso); if (ret) return ret; } if (vlan_hlen) { int ret; struct veth veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: /* caller is in process context, */ preempt_disable(); dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen); preempt_enable(); return total; } static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) { DECLARE_WAITQUEUE(wait, current); void *ptr = NULL; int error = 0; ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) goto out; if (noblock) { error = -EAGAIN; goto out; } add_wait_queue(&tfile->socket.wq.wait, &wait); while (1) { set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; if (signal_pending(current)) { error = -ERESTARTSYS; break; } if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) { error = -EFAULT; break; } schedule(); } __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->socket.wq.wait, &wait); out: *err = error; return ptr; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *to, int noblock, void *ptr) { ssize_t ret; int err; if (!iov_iter_count(to)) { tun_ptr_free(ptr); return 0; } if (!ptr) { /* Read frames from ring */ ptr = tun_ring_recv(tfile, noblock, &err); if (!ptr) return err; } if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); ret = tun_put_user_xdp(tun, tfile, xdpf, to); xdp_return_frame(xdpf); } else { struct sk_buff *skb = ptr; ret = tun_put_user(tun, tfile, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); ssize_t len = iov_iter_count(to), ret; int noblock = 0; if (!tun) return -EBADFD; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tun_do_read(tun, tfile, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; tun_put(tun); return ret; } static void tun_prog_free(struct rcu_head *rcu) { struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu); bpf_prog_destroy(prog->prog); kfree(prog); } static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, struct bpf_prog *prog) { struct tun_prog *old, *new = NULL; if (prog) { new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->prog = prog; } spin_lock_bh(&tun->lock); old = rcu_dereference_protected(*prog_p, lockdep_is_held(&tun->lock)); rcu_assign_pointer(*prog_p, new); spin_unlock_bh(&tun->lock); if (old) call_rcu(&old->rcu, tun_prog_free); return 0; } static void tun_free_netdev(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); BUG_ON(!(list_empty(&tun->disabled))); tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); __tun_set_ebpf(tun, &tun->filter_prog, NULL); } static void tun_setup(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); tun->owner = INVALID_UID; tun->group = INVALID_GID; tun_default_link_ksettings(dev, &tun->link_ksettings); dev->ethtool_ops = &tun_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = tun_free_netdev; /* We prefer our own queue length */ dev->tx_queue_len = TUN_READQ_SIZE; } /* Trivial set of netlink ops to allow deleting tun or tap * device with netlink. */ static int tun_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported."); return -EOPNOTSUPP; } static size_t tun_get_size(const struct net_device *dev) { BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t)); BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t)); return nla_total_size(sizeof(uid_t)) + /* OWNER */ nla_total_size(sizeof(gid_t)) + /* GROUP */ nla_total_size(sizeof(u8)) + /* TYPE */ nla_total_size(sizeof(u8)) + /* PI */ nla_total_size(sizeof(u8)) + /* VNET_HDR */ nla_total_size(sizeof(u8)) + /* PERSIST */ nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */ nla_total_size(sizeof(u32)) + /* NUM_QUEUES */ nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */ 0; } static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK)) goto nla_put_failure; if (uid_valid(tun->owner) && nla_put_u32(skb, IFLA_TUN_OWNER, from_kuid_munged(current_user_ns(), tun->owner))) goto nla_put_failure; if (gid_valid(tun->group) && nla_put_u32(skb, IFLA_TUN_GROUP, from_kgid_munged(current_user_ns(), tun->group))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST))) goto nla_put_failure; if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE, !!(tun->flags & IFF_MULTI_QUEUE))) goto nla_put_failure; if (tun->flags & IFF_MULTI_QUEUE) { if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES, tun->numdisabled)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops tun_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct tun_struct), .setup = tun_setup, .validate = tun_validate, .get_size = tun_get_size, .fill_info = tun_fill_info, }; static void tun_sock_write_space(struct sock *sk) { struct tun_file *tfile; wait_queue_head_t *wqueue; if (!sock_writeable(sk)) return; if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_sync_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); tfile = container_of(sk, struct tun_file, sk); kill_fasync(&tfile->fasync, SIGIO, POLL_OUT); } static void tun_put_page(struct tun_page *tpage) { if (tpage->page) __page_frag_cache_drain(tpage->page, tpage->count); } static int tun_xdp_one(struct tun_struct *tun, struct tun_file *tfile, struct xdp_buff *xdp, int *flush, struct tun_page *tpage) { unsigned int datasize = xdp->data_end - xdp->data; struct virtio_net_hdr *gso = xdp->data_hard_start; struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; netdev_features_t features; u32 rxhash = 0, act; int buflen = xdp->frame_sz; int metasize = 0; int ret = 0; bool skb_xdp = false; struct page *page; if (unlikely(datasize < ETH_HLEN)) return -EINVAL; xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { if (gso->gso_type) { skb_xdp = true; goto build; } xdp_init_buff(xdp, buflen, &tfile->xdp_rxq); act = bpf_prog_run_xdp(xdp_prog, xdp); ret = tun_xdp_act(tun, xdp_prog, xdp, act); if (ret < 0) { put_page(virt_to_head_page(xdp->data)); return ret; } switch (ret) { case XDP_REDIRECT: *flush = true; fallthrough; case XDP_TX: return 0; case XDP_PASS: break; default: page = virt_to_head_page(xdp->data); if (tpage->page == page) { ++tpage->count; } else { tun_put_page(tpage); tpage->page = page; tpage->count = 1; } return 0; } } build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { ret = -ENOMEM; goto out; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); /* The externally provided xdp_buff may have no metadata support, which * is marked by xdp->data_meta being xdp->data + 1. This will lead to a * metasize of -1 and is the reason why the condition checks for > 0. */ metasize = xdp->data - xdp->data_meta; if (metasize > 0) skb_metadata_set(skb, metasize); features = tun_vnet_hdr_guest_features(READ_ONCE(tun->vnet_hdr_sz)); tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso; if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; goto out; } skb->protocol = eth_type_trans(skb, tun->dev); skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { ret = 0; goto out; } } if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); if (tfile->napi_enabled) { queue = &tfile->sk.sk_write_queue; spin_lock(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock(&queue->lock); kfree_skb(skb); return -EBUSY; } __skb_queue_tail(queue, skb); spin_unlock(&queue->lock); ret = 1; } else { netif_receive_skb(skb); ret = 0; } /* No need to disable preemption here since this function is * always called with bh disabled */ dev_sw_netstats_rx_add(tun->dev, datasize); if (rxhash) tun_flow_update(tun, rxhash, tfile); out: return ret; } static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret, i; struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; if (!tun) return -EBADFD; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct tun_page tpage; int n = ctl->num; int flush = 0, queued = 0; memset(&tpage, 0, sizeof(tpage)); local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); for (i = 0; i < n; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage); if (ret > 0) queued += ret; } if (flush) xdp_do_flush(); if (tfile->napi_enabled && queued > 0) napi_schedule(&tfile->napi); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); tun_put_page(&tpage); ret = total_len; goto out; } ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT, m->msg_flags & MSG_MORE); out: tun_put(tun); return ret; } static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun = tun_get(tfile); void *ptr = m->msg_control; int ret; if (!tun) { ret = -EBADFD; goto out_free; } if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) { ret = -EINVAL; goto out_put_tun; } if (flags & MSG_ERRQUEUE) { ret = sock_recv_errqueue(sock->sk, m, total_len, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr); if (ret > (ssize_t)total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } out: tun_put(tun); return ret; out_put_tun: tun_put(tun); out_free: tun_ptr_free(ptr); return ret; } static int tun_ptr_peek_len(void *ptr) { if (likely(ptr)) { if (tun_is_xdp_frame(ptr)) { struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr); return xdpf->len; } return __skb_array_len_with_tag(ptr); } else { return 0; } } static int tun_peek_len(struct socket *sock) { struct tun_file *tfile = container_of(sock, struct tun_file, socket); struct tun_struct *tun; int ret = 0; tun = tun_get(tfile); if (!tun) return 0; ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len); tun_put(tun); return ret; } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tun_socket_ops = { .peek_len = tun_peek_len, .sendmsg = tun_sendmsg, .recvmsg = tun_recvmsg, }; static struct proto tun_proto = { .name = "tun", .owner = THIS_MODULE, .obj_size = sizeof(struct tun_file), }; static int tun_flags(struct tun_struct *tun) { return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return sysfs_emit(buf, "0x%x\n", tun_flags(tun)); } static ssize_t owner_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return uid_valid(tun->owner)? sysfs_emit(buf, "%u\n", from_kuid_munged(current_user_ns(), tun->owner)) : sysfs_emit(buf, "-1\n"); } static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tun_struct *tun = netdev_priv(to_net_dev(dev)); return gid_valid(tun->group) ? sysfs_emit(buf, "%u\n", from_kgid_munged(current_user_ns(), tun->group)) : sysfs_emit(buf, "-1\n"); } static DEVICE_ATTR_RO(tun_flags); static DEVICE_ATTR_RO(owner); static DEVICE_ATTR_RO(group); static struct attribute *tun_dev_attrs[] = { &dev_attr_tun_flags.attr, &dev_attr_owner.attr, &dev_attr_group.attr, NULL }; static const struct attribute_group tun_attr_group = { .attrs = tun_dev_attrs }; static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) { struct tun_struct *tun; struct tun_file *tfile = file->private_data; struct net_device *dev; int err; if (tfile->detached) return -EINVAL; if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!(ifr->ifr_flags & IFF_NAPI) || (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) return -EINVAL; } dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) return -EBUSY; if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops) tun = netdev_priv(dev); else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops) tun = netdev_priv(dev); else return -EINVAL; if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) != !!(tun->flags & IFF_MULTI_QUEUE)) return -EINVAL; if (tun_not_capable(tun)) return -EPERM; err = security_tun_dev_open(tun->security); if (err < 0) return err; err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, true); if (err < 0) return err; if (tun->flags & IFF_MULTI_QUEUE && (tun->numqueues + tun->numdisabled > 1)) { /* One or more queue has already been attached, no need * to initialize the device again. */ netdev_state_change(dev); return 0; } tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); netdev_state_change(dev); } else { char *name; unsigned long flags = 0; int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; err = security_tun_dev_create(); if (err < 0) return err; /* Set dev type */ if (ifr->ifr_flags & IFF_TUN) { /* TUN device */ flags |= IFF_TUN; name = "tun%d"; } else if (ifr->ifr_flags & IFF_TAP) { /* TAP device */ flags |= IFF_TAP; name = "tap%d"; } else return -EINVAL; if (*ifr->ifr_name) name = ifr->ifr_name; dev = alloc_netdev_mqs(sizeof(struct tun_struct), name, NET_NAME_UNKNOWN, tun_setup, queues, queues); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &tun_link_ops; dev->ifindex = tfile->ifindex; dev->sysfs_groups[0] = &tun_attr_group; tun = netdev_priv(dev); tun->dev = dev; tun->flags = flags; tun->txflt.count = 0; tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr); tun->align = NET_SKB_PAD; tun->filter_attached = false; tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL); tun->ifr = ifr; tun->file = file; tun_net_initialize(dev); err = register_netdevice(tun->dev); if (err < 0) { free_netdev(dev); return err; } /* free_netdev() won't check refcnt, to avoid race * with dev_put() we need publish tun after registration. */ rcu_assign_pointer(tfile->tun, tun); } if (ifr->ifr_flags & IFF_NO_CARRIER) netif_carrier_off(tun->dev); else netif_carrier_on(tun->dev); /* Make sure persistent devices do not get stuck in * xoff state. */ if (netif_running(tun->dev)) netif_tx_wake_all_queues(tun->dev); strscpy(ifr->ifr_name, tun->dev->name); return 0; } static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) { strscpy(ifr->ifr_name, tun->dev->name); ifr->ifr_flags = tun_flags(tun); } #define PLAIN_GSO (NETIF_F_GSO_UDP_L4 | NETIF_F_TSO | NETIF_F_TSO6) /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) { netdev_features_t features = 0; if (arg & TUN_F_CSUM) { features |= NETIF_F_HW_CSUM; arg &= ~TUN_F_CSUM; if (arg & (TUN_F_TSO4|TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) { features |= NETIF_F_TSO_ECN; arg &= ~TUN_F_TSO_ECN; } if (arg & TUN_F_TSO4) features |= NETIF_F_TSO; if (arg & TUN_F_TSO6) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } arg &= ~TUN_F_UFO; /* TODO: for now USO4 and USO6 should work simultaneously */ if (arg & TUN_F_USO4 && arg & TUN_F_USO6) { features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } /* * Tunnel offload is allowed only if some plain offload is * available, too. */ if (features & PLAIN_GSO && arg & TUN_F_UDP_TUNNEL_GSO) { features |= NETIF_F_GSO_UDP_TUNNEL; if (arg & TUN_F_UDP_TUNNEL_GSO_CSUM) features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; arg &= ~(TUN_F_UDP_TUNNEL_GSO | TUN_F_UDP_TUNNEL_GSO_CSUM); } } /* This gives the user a way to test for new features in future by * trying to set them. */ if (arg) return -EINVAL; tun->set_features = features; tun->dev->wanted_features &= ~TUN_USER_FEATURES; tun->dev->wanted_features |= features; netdev_update_features(tun->dev); return 0; } static void tun_detach_filter(struct tun_struct *tun, int n) { int i; struct tun_file *tfile; for (i = 0; i < n; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); sk_detach_filter(tfile->socket.sk); release_sock(tfile->socket.sk); } tun->filter_attached = false; } static int tun_attach_filter(struct tun_struct *tun) { int i, ret = 0; struct tun_file *tfile; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); lock_sock(tfile->socket.sk); ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); release_sock(tfile->socket.sk); if (ret) { tun_detach_filter(tun, i); return ret; } } tun->filter_attached = true; return ret; } static void tun_set_sndbuf(struct tun_struct *tun) { struct tun_file *tfile; int i; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_sndbuf = tun->sndbuf; } } static int tun_set_queue(struct file *file, struct ifreq *ifr) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; int ret = 0; rtnl_lock(); if (ifr->ifr_flags & IFF_ATTACH_QUEUE) { tun = tfile->detached; if (!tun) { ret = -EINVAL; goto unlock; } ret = security_tun_dev_attach_queue(tun->security); if (ret < 0) goto unlock; ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI, tun->flags & IFF_NAPI_FRAGS, true); } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) { tun = rtnl_dereference(tfile->tun); if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached) ret = -EINVAL; else __tun_detach(tfile, false); } else ret = -EINVAL; if (ret >= 0) netdev_state_change(tun->dev); unlock: rtnl_unlock(); return ret; } static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p, void __user *data) { struct bpf_prog *prog; int fd; if (copy_from_user(&fd, data, sizeof(fd))) return -EFAULT; if (fd == -1) { prog = NULL; } else { prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog)) return PTR_ERR(prog); } return __tun_set_ebpf(tun, prog_p, prog); } /* Return correct value for tun->dev->addr_len based on tun->dev->type. */ static unsigned char tun_get_addr_len(unsigned short type) { switch (type) { case ARPHRD_IP6GRE: case ARPHRD_TUNNEL6: return sizeof(struct in6_addr); case ARPHRD_IPGRE: case ARPHRD_TUNNEL: case ARPHRD_SIT: return 4; case ARPHRD_ETHER: return ETH_ALEN; case ARPHRD_IEEE802154: case ARPHRD_IEEE802154_MONITOR: return IEEE802154_EXTENDED_ADDR_LEN; case ARPHRD_PHONET_PIPE: case ARPHRD_PPP: case ARPHRD_NONE: return 0; case ARPHRD_6LOWPAN: return EUI64_ADDR_LEN; case ARPHRD_FDDI: return FDDI_K_ALEN; case ARPHRD_HIPPI: return HIPPI_ALEN; case ARPHRD_IEEE802: return FC_ALEN; case ARPHRD_ROSE: return ROSE_ADDR_LEN; case ARPHRD_NETROM: return AX25_ADDR_LEN; case ARPHRD_LOCALTLK: return LTALK_ALEN; default: return 0; } } static long __tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg, int ifreq_len) { struct tun_file *tfile = file->private_data; struct net *net = sock_net(&tfile->sk); struct tun_struct *tun; void __user* argp = (void __user*)arg; unsigned int carrier; struct ifreq ifr; kuid_t owner; kgid_t group; int ifindex; int sndbuf; int ret; bool do_notify = false; if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; } else { memset(&ifr, 0, sizeof(ifr)); } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on * TUNSETIFF. */ return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) { return tun_set_queue(file, &ifr); } else if (cmd == SIOCGSKNS) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; return open_related_ns(&net->ns, get_net_ns); } rtnl_lock(); tun = tun_get(tfile); if (cmd == TUNSETIFF) { ret = -EEXIST; if (tun) goto unlock; ifr.ifr_name[IFNAMSIZ-1] = '\0'; ret = tun_set_iff(net, file, &ifr); if (ret) goto unlock; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; goto unlock; } if (cmd == TUNSETIFINDEX) { ret = -EPERM; if (tun) goto unlock; ret = -EFAULT; if (copy_from_user(&ifindex, argp, sizeof(ifindex))) goto unlock; ret = -EINVAL; if (ifindex < 0) goto unlock; ret = 0; tfile->ifindex = ifindex; goto unlock; } ret = -EBADFD; if (!tun) goto unlock; netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd); net = dev_net(tun->dev); ret = 0; switch (cmd) { case TUNGETIFF: tun_get_iff(tun, &ifr); if (tfile->detached) ifr.ifr_flags |= IFF_DETACH_QUEUE; if (!tfile->socket.sk->sk_filter) ifr.ifr_flags |= IFF_NOFILTER; if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case TUNSETNOCSUM: /* Disable/Enable checksum */ /* [unimplemented] */ netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n", arg ? "disabled" : "enabled"); break; case TUNSETPERSIST: /* Disable/Enable persist mode. Keep an extra reference to the * module to prevent the module being unprobed. */ if (arg && !(tun->flags & IFF_PERSIST)) { tun->flags |= IFF_PERSIST; __module_get(THIS_MODULE); do_notify = true; } if (!arg && (tun->flags & IFF_PERSIST)) { tun->flags &= ~IFF_PERSIST; module_put(THIS_MODULE); do_notify = true; } netif_info(tun, drv, tun->dev, "persist %s\n", arg ? "enabled" : "disabled"); break; case TUNSETOWNER: /* Set owner of the device */ owner = make_kuid(current_user_ns(), arg); if (!uid_valid(owner)) { ret = -EINVAL; break; } tun->owner = owner; do_notify = true; netif_info(tun, drv, tun->dev, "owner set to %u\n", from_kuid(&init_user_ns, tun->owner)); break; case TUNSETGROUP: /* Set group of the device */ group = make_kgid(current_user_ns(), arg); if (!gid_valid(group)) { ret = -EINVAL; break; } tun->group = group; do_notify = true; netif_info(tun, drv, tun->dev, "group set to %u\n", from_kgid(&init_user_ns, tun->group)); break; case TUNSETLINK: /* Only allow setting the type when the interface is down */ if (tun->dev->flags & IFF_UP) { netif_info(tun, drv, tun->dev, "Linktype set failed because interface is up\n"); ret = -EBUSY; } else { ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, tun->dev); ret = notifier_to_errno(ret); if (ret) { netif_info(tun, drv, tun->dev, "Refused to change device type\n"); break; } tun->dev->type = (int) arg; tun->dev->addr_len = tun_get_addr_len(tun->dev->type); netif_info(tun, drv, tun->dev, "linktype set to %d\n", tun->dev->type); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, tun->dev); } break; case TUNSETDEBUG: tun->msg_enable = (u32)arg; break; case TUNSETOFFLOAD: ret = set_offload(tun, arg); break; case TUNSETTXFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = update_filter(&tun->txflt, (void __user *)arg); break; case SIOCGIFHWADDR: /* Get hw address */ netif_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name); if (copy_to_user(argp, &ifr, ifreq_len)) ret = -EFAULT; break; case SIOCSIFHWADDR: /* Set hw address */ if (tun->dev->addr_len > sizeof(ifr.ifr_hwaddr)) { ret = -EINVAL; break; } ret = dev_set_mac_address_user(tun->dev, (struct sockaddr_storage *)&ifr.ifr_hwaddr, NULL); break; case TUNGETSNDBUF: sndbuf = tfile->socket.sk->sk_sndbuf; if (copy_to_user(argp, &sndbuf, sizeof(sndbuf))) ret = -EFAULT; break; case TUNSETSNDBUF: if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) { ret = -EFAULT; break; } if (sndbuf <= 0) { ret = -EINVAL; break; } tun->sndbuf = sndbuf; tun_set_sndbuf(tun); break; case TUNATTACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog))) break; ret = tun_attach_filter(tun); break; case TUNDETACHFILTER: /* Can be set only for TAPs */ ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = 0; tun_detach_filter(tun, tun->numqueues); break; case TUNGETFILTER: ret = -EINVAL; if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) break; ret = -EFAULT; if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog))) break; ret = 0; break; case TUNSETSTEERINGEBPF: ret = tun_set_ebpf(tun, &tun->steering_prog, argp); break; case TUNSETFILTEREBPF: ret = tun_set_ebpf(tun, &tun->filter_prog, argp); break; case TUNSETCARRIER: ret = -EFAULT; if (copy_from_user(&carrier, argp, sizeof(carrier))) goto unlock; ret = tun_net_change_carrier(tun->dev, (bool)carrier); break; case TUNGETDEVNETNS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto unlock; ret = open_related_ns(&net->ns, get_net_ns); break; default: ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp); break; } if (do_notify) netdev_state_change(tun->dev); unlock: rtnl_unlock(); if (tun) tun_put(tun); return ret; } static long tun_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq)); } #ifdef CONFIG_COMPAT static long tun_chr_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case TUNSETIFF: case TUNGETIFF: case TUNSETTXFILTER: case TUNGETSNDBUF: case TUNSETSNDBUF: case SIOCGIFHWADDR: case SIOCSIFHWADDR: arg = (unsigned long)compat_ptr(arg); break; default: arg = (compat_ulong_t)arg; break; } /* * compat_ifreq is shorter than ifreq, so we must not access beyond * the end of that structure. All fields that are used in this * driver are compatible though, we don't need to convert the * contents. */ return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq)); } #endif /* CONFIG_COMPAT */ static int tun_chr_fasync(int fd, struct file *file, int on) { struct tun_file *tfile = file->private_data; int ret; if (on) { ret = file_f_owner_allocate(file); if (ret) goto out; } if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; if (on) { __f_setown(file, task_pid(current), PIDTYPE_TGID, 0); tfile->flags |= TUN_FASYNC; } else tfile->flags &= ~TUN_FASYNC; ret = 0; out: return ret; } static int tun_chr_open(struct inode *inode, struct file * file) { struct net *net = current->nsproxy->net_ns; struct tun_file *tfile; tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto, 0); if (!tfile) return -ENOMEM; if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) { sk_free(&tfile->sk); return -ENOMEM; } mutex_init(&tfile->napi_mutex); RCU_INIT_POINTER(tfile->tun, NULL); tfile->flags = 0; tfile->ifindex = 0; init_waitqueue_head(&tfile->socket.wq.wait); tfile->socket.file = file; tfile->socket.ops = &tun_socket_ops; sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid()); tfile->sk.sk_write_space = tun_sock_write_space; tfile->sk.sk_sndbuf = INT_MAX; file->private_data = tfile; INIT_LIST_HEAD(&tfile->next); sock_set_flag(&tfile->sk, SOCK_ZEROCOPY); /* tun groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; return 0; } static int tun_chr_close(struct inode *inode, struct file *file) { struct tun_file *tfile = file->private_data; tun_detach(tfile, true); return 0; } #ifdef CONFIG_PROC_FS static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file) { struct tun_file *tfile = file->private_data; struct tun_struct *tun; struct ifreq ifr; memset(&ifr, 0, sizeof(ifr)); rtnl_lock(); tun = tun_get(tfile); if (tun) tun_get_iff(tun, &ifr); rtnl_unlock(); if (tun) tun_put(tun); seq_printf(m, "iff:\t%s\n", ifr.ifr_name); } #endif static const struct file_operations tun_fops = { .owner = THIS_MODULE, .read_iter = tun_chr_read_iter, .write_iter = tun_chr_write_iter, .poll = tun_chr_poll, .unlocked_ioctl = tun_chr_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = tun_chr_compat_ioctl, #endif .open = tun_chr_open, .release = tun_chr_close, .fasync = tun_chr_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = tun_chr_show_fdinfo, #endif }; static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", .nodename = "net/tun", .fops = &tun_fops, }; /* ethtool interface */ static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { ethtool_link_ksettings_zero_link_mode(cmd, supported); ethtool_link_ksettings_zero_link_mode(cmd, advertising); cmd->base.speed = SPEED_10000; cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_TP; cmd->base.phy_address = 0; cmd->base.autoneg = AUTONEG_DISABLE; } static int tun_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(cmd, &tun->link_ksettings, sizeof(*cmd)); return 0; } static int tun_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct tun_struct *tun = netdev_priv(dev); memcpy(&tun->link_ksettings, cmd, sizeof(*cmd)); return 0; } static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct tun_struct *tun = netdev_priv(dev); strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: strscpy(info->bus_info, "tun", sizeof(info->bus_info)); break; case IFF_TAP: strscpy(info->bus_info, "tap", sizeof(info->bus_info)); break; } } static u32 tun_get_msglevel(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); return tun->msg_enable; } static void tun_set_msglevel(struct net_device *dev, u32 value) { struct tun_struct *tun = netdev_priv(dev); tun->msg_enable = value; } static int tun_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); ec->rx_max_coalesced_frames = tun->rx_batched; return 0; } static int tun_set_coalesce(struct net_device *dev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, struct netlink_ext_ack *extack) { struct tun_struct *tun = netdev_priv(dev); if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT) tun->rx_batched = NAPI_POLL_WEIGHT; else tun->rx_batched = ec->rx_max_coalesced_frames; return 0; } static void tun_get_channels(struct net_device *dev, struct ethtool_channels *channels) { struct tun_struct *tun = netdev_priv(dev); channels->combined_count = tun->numqueues; channels->max_combined = tun->flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_channels = tun_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun"); MODULE_IMPORT_NS("NETDEV_INTERNAL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ /* * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they * continue to use just usb_device and usb_gadget. */ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H #include <linux/extcon.h> #include <linux/notifier.h> #include <linux/usb.h> #include <uapi/linux/usb/charger.h> enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, USBPHY_INTERFACE_MODE_UTMI, USBPHY_INTERFACE_MODE_UTMIW, USBPHY_INTERFACE_MODE_ULPI, USBPHY_INTERFACE_MODE_SERIAL, USBPHY_INTERFACE_MODE_HSIC, }; enum usb_phy_events { USB_EVENT_NONE, /* no events or cable disconnected */ USB_EVENT_VBUS, /* vbus valid event */ USB_EVENT_ID, /* id was grounded */ USB_EVENT_CHARGER, /* usb dedicated charger */ USB_EVENT_ENUMERATED, /* gadget driver enumerated */ }; /* associate a type with PHY */ enum usb_phy_type { USB_PHY_TYPE_UNDEFINED, USB_PHY_TYPE_USB2, USB_PHY_TYPE_USB3, }; /* OTG defines lots of enumeration states before device reset */ enum usb_otg_state { OTG_STATE_UNDEFINED = 0, /* single-role peripheral, and dual-role default-b */ OTG_STATE_B_IDLE, OTG_STATE_B_SRP_INIT, OTG_STATE_B_PERIPHERAL, /* extra dual-role default-b states */ OTG_STATE_B_WAIT_ACON, OTG_STATE_B_HOST, /* dual-role default-a */ OTG_STATE_A_IDLE, OTG_STATE_A_WAIT_VRISE, OTG_STATE_A_WAIT_BCON, OTG_STATE_A_HOST, OTG_STATE_A_SUSPEND, OTG_STATE_A_PERIPHERAL, OTG_STATE_A_WAIT_VFALL, OTG_STATE_A_VBUS_ERR, }; struct usb_phy; struct usb_otg; /* for phys connected thru an ULPI interface, the user must * provide access ops */ struct usb_phy_io_ops { int (*read)(struct usb_phy *x, u32 reg); int (*write)(struct usb_phy *x, u32 val, u32 reg); }; struct usb_charger_current { unsigned int sdp_min; unsigned int sdp_max; unsigned int dcp_min; unsigned int dcp_max; unsigned int cdp_min; unsigned int cdp_max; unsigned int aca_min; unsigned int aca_max; }; struct usb_phy { struct device *dev; const char *label; unsigned int flags; enum usb_phy_type type; enum usb_phy_events last_event; struct usb_otg *otg; struct device *io_dev; struct usb_phy_io_ops *io_ops; void __iomem *io_priv; /* to support extcon device */ struct extcon_dev *edev; struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; struct notifier_block type_nb; /* Support USB charger */ enum usb_charger_type chg_type; enum usb_charger_state chg_state; struct usb_charger_current chg_cur; struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; /* to pass extra port status to the root hub */ u16 port_status; u16 port_change; /* to support controllers that have multiple phys */ struct list_head head; /* initialize/shutdown the phy */ int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); /* enable/disable VBUS */ int (*set_vbus)(struct usb_phy *x, int on); /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); /* Set phy into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); /* * Set wakeup enable for PHY, in that case, the PHY can be * woken up from suspend status due to external events, * like vbus change, dp/dm change and id. */ int (*set_wakeup)(struct usb_phy *x, bool enabled); /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); /* * Charger detection method can be implemented if you need to * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); extern void usb_remove_phy(struct usb_phy *); /* helpers for direct access thru low-level io interface */ static inline int usb_phy_io_read(struct usb_phy *x, u32 reg) { if (x && x->io_ops && x->io_ops->read) return x->io_ops->read(x, reg); return -EINVAL; } static inline int usb_phy_io_write(struct usb_phy *x, u32 val, u32 reg) { if (x && x->io_ops && x->io_ops->write) return x->io_ops->write(x, val, reg); return -EINVAL; } static inline int usb_phy_init(struct usb_phy *x) { if (x && x->init) return x->init(x); return 0; } static inline void usb_phy_shutdown(struct usb_phy *x) { if (x && x->shutdown) x->shutdown(x); } static inline int usb_phy_vbus_on(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, true); } static inline int usb_phy_vbus_off(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, false); } /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max); extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb) { return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) { } static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) { } static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max) { } static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state) { } #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { if (!x) return 0; usb_phy_set_charger_current(x, mA); if (x->set_power) return x->set_power(x, mA); return 0; } /* Context: can sleep */ static inline int usb_phy_set_suspend(struct usb_phy *x, int suspend) { if (x && x->set_suspend != NULL) return x->set_suspend(x, suspend); else return 0; } static inline int usb_phy_set_wakeup(struct usb_phy *x, bool enabled) { if (x && x->set_wakeup) return x->set_wakeup(x, enabled); else return 0; } static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_connect) return x->notify_connect(x, speed); else return 0; } static inline int usb_phy_notify_disconnect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_disconnect) return x->notify_disconnect(x, speed); else return 0; } /* notifiers */ static inline int usb_register_notifier(struct usb_phy *x, struct notifier_block *nb) { return atomic_notifier_chain_register(&x->notifier, nb); } static inline void usb_unregister_notifier(struct usb_phy *x, struct notifier_block *nb) { atomic_notifier_chain_unregister(&x->notifier, nb); } static inline const char *usb_phy_type_string(enum usb_phy_type type) { switch (type) { case USB_PHY_TYPE_USB2: return "USB2 PHY"; case USB_PHY_TYPE_USB3: return "USB3 PHY"; default: return "UNKNOWN PHY TYPE"; } } #endif /* __LINUX_USB_PHY_H */
34 13213 7 13372 13372 13207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2020 SiFive */ #ifndef __ASM_RISCV_VECTOR_H #define __ASM_RISCV_VECTOR_H #include <linux/types.h> #include <uapi/asm-generic/errno.h> #ifdef CONFIG_RISCV_ISA_V #include <linux/stringify.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm/ptrace.h> #include <asm/cpufeature.h> #include <asm/csr.h> #include <asm/asm.h> #include <asm/vendorid_list.h> #include <asm/vendor_extensions.h> #include <asm/vendor_extensions/thead.h> #define __riscv_v_vstate_or(_val, TYPE) ({ \ typeof(_val) _res = _val; \ if (has_xtheadvector()) \ _res = (_res & ~SR_VS_THEAD) | SR_VS_##TYPE##_THEAD; \ else \ _res = (_res & ~SR_VS) | SR_VS_##TYPE; \ _res; \ }) #define __riscv_v_vstate_check(_val, TYPE) ({ \ bool _res; \ if (has_xtheadvector()) \ _res = ((_val) & SR_VS_THEAD) == SR_VS_##TYPE##_THEAD; \ else \ _res = ((_val) & SR_VS) == SR_VS_##TYPE; \ _res; \ }) extern unsigned long riscv_v_vsize; int riscv_v_setup_vsize(void); bool insn_is_vector(u32 insn_buf); bool riscv_v_first_use_handler(struct pt_regs *regs); void kernel_vector_begin(void); void kernel_vector_end(void); void get_cpu_vector_context(void); void put_cpu_vector_context(void); void riscv_v_thread_free(struct task_struct *tsk); void __init riscv_v_setup_ctx_cache(void); void riscv_v_thread_alloc(struct task_struct *tsk); void __init update_regset_vector_info(unsigned long size); static inline u32 riscv_v_flags(void) { return READ_ONCE(current->thread.riscv_v_flags); } static __always_inline bool has_vector(void) { return riscv_has_extension_unlikely(RISCV_ISA_EXT_ZVE32X); } static __always_inline bool has_xtheadvector_no_alternatives(void) { if (IS_ENABLED(CONFIG_RISCV_ISA_XTHEADVECTOR)) return riscv_isa_vendor_extension_available(THEAD_VENDOR_ID, XTHEADVECTOR); else return false; } static __always_inline bool has_xtheadvector(void) { if (IS_ENABLED(CONFIG_RISCV_ISA_XTHEADVECTOR)) return riscv_has_vendor_extension_unlikely(THEAD_VENDOR_ID, RISCV_ISA_VENDOR_EXT_XTHEADVECTOR); else return false; } static inline void __riscv_v_vstate_clean(struct pt_regs *regs) { regs->status = __riscv_v_vstate_or(regs->status, CLEAN); } static inline void __riscv_v_vstate_dirty(struct pt_regs *regs) { regs->status = __riscv_v_vstate_or(regs->status, DIRTY); } static inline void riscv_v_vstate_off(struct pt_regs *regs) { regs->status = __riscv_v_vstate_or(regs->status, OFF); } static inline void riscv_v_vstate_on(struct pt_regs *regs) { regs->status = __riscv_v_vstate_or(regs->status, INITIAL); } static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return !__riscv_v_vstate_check(regs->status, OFF); } static __always_inline void riscv_v_enable(void) { if (has_xtheadvector()) csr_set(CSR_SSTATUS, SR_VS_THEAD); else csr_set(CSR_SSTATUS, SR_VS); } static __always_inline void riscv_v_disable(void) { if (has_xtheadvector()) csr_clear(CSR_SSTATUS, SR_VS_THEAD); else csr_clear(CSR_SSTATUS, SR_VS); } static __always_inline bool riscv_v_is_on(void) { return !!(csr_read(CSR_SSTATUS) & SR_VS); } static __always_inline void __vstate_csr_save(struct __riscv_v_ext_state *dest) { asm volatile ( "csrr %0, " __stringify(CSR_VSTART) "\n\t" "csrr %1, " __stringify(CSR_VTYPE) "\n\t" "csrr %2, " __stringify(CSR_VL) "\n\t" : "=r" (dest->vstart), "=r" (dest->vtype), "=r" (dest->vl), "=r" (dest->vcsr) : :); if (has_xtheadvector()) { unsigned long status; /* * CSR_VCSR is defined as * [2:1] - vxrm[1:0] * [0] - vxsat * The earlier vector spec implemented by T-Head uses separate * registers for the same bit-elements, so just combine those * into the existing output field. * * Additionally T-Head cores need FS to be enabled when accessing * the VXRM and VXSAT CSRs, otherwise ending in illegal instructions. * Though the cores do not implement the VXRM and VXSAT fields in the * FCSR CSR that vector-0.7.1 specifies. */ status = csr_read_set(CSR_STATUS, SR_FS_DIRTY); dest->vcsr = csr_read(CSR_VXSAT) | csr_read(CSR_VXRM) << CSR_VXRM_SHIFT; dest->vlenb = riscv_v_vsize / 32; if ((status & SR_FS) != SR_FS_DIRTY) csr_write(CSR_STATUS, status); } else { dest->vcsr = csr_read(CSR_VCSR); dest->vlenb = csr_read(CSR_VLENB); } } static __always_inline void __vstate_csr_restore(struct __riscv_v_ext_state *src) { asm volatile ( ".option push\n\t" ".option arch, +zve32x\n\t" "vsetvl x0, %2, %1\n\t" ".option pop\n\t" "csrw " __stringify(CSR_VSTART) ", %0\n\t" : : "r" (src->vstart), "r" (src->vtype), "r" (src->vl)); if (has_xtheadvector()) { unsigned long status = csr_read(CSR_SSTATUS); /* * Similar to __vstate_csr_save above, restore values for the * separate VXRM and VXSAT CSRs from the vcsr variable. */ status = csr_read_set(CSR_STATUS, SR_FS_DIRTY); csr_write(CSR_VXRM, (src->vcsr >> CSR_VXRM_SHIFT) & CSR_VXRM_MASK); csr_write(CSR_VXSAT, src->vcsr & CSR_VXSAT_MASK); if ((status & SR_FS) != SR_FS_DIRTY) csr_write(CSR_STATUS, status); } else { csr_write(CSR_VCSR, src->vcsr); } } static inline void __riscv_v_vstate_save(struct __riscv_v_ext_state *save_to, void *datap) { unsigned long vl; riscv_v_enable(); __vstate_csr_save(save_to); if (has_xtheadvector()) { asm volatile ( "mv t0, %0\n\t" THEAD_VSETVLI_T4X0E8M8D1 THEAD_VSB_V_V0T0 "add t0, t0, t4\n\t" THEAD_VSB_V_V8T0 "add t0, t0, t4\n\t" THEAD_VSB_V_V16T0 "add t0, t0, t4\n\t" THEAD_VSB_V_V24T0 : : "r" (datap) : "memory", "t0", "t4"); } else { asm volatile ( ".option push\n\t" ".option arch, +zve32x\n\t" "vsetvli %0, x0, e8, m8, ta, ma\n\t" "vse8.v v0, (%1)\n\t" "add %1, %1, %0\n\t" "vse8.v v8, (%1)\n\t" "add %1, %1, %0\n\t" "vse8.v v16, (%1)\n\t" "add %1, %1, %0\n\t" "vse8.v v24, (%1)\n\t" ".option pop\n\t" : "=&r" (vl) : "r" (datap) : "memory"); } riscv_v_disable(); } static inline void __riscv_v_vstate_restore(struct __riscv_v_ext_state *restore_from, void *datap) { unsigned long vl; riscv_v_enable(); if (has_xtheadvector()) { asm volatile ( "mv t0, %0\n\t" THEAD_VSETVLI_T4X0E8M8D1 THEAD_VLB_V_V0T0 "add t0, t0, t4\n\t" THEAD_VLB_V_V8T0 "add t0, t0, t4\n\t" THEAD_VLB_V_V16T0 "add t0, t0, t4\n\t" THEAD_VLB_V_V24T0 : : "r" (datap) : "memory", "t0", "t4"); } else { asm volatile ( ".option push\n\t" ".option arch, +zve32x\n\t" "vsetvli %0, x0, e8, m8, ta, ma\n\t" "vle8.v v0, (%1)\n\t" "add %1, %1, %0\n\t" "vle8.v v8, (%1)\n\t" "add %1, %1, %0\n\t" "vle8.v v16, (%1)\n\t" "add %1, %1, %0\n\t" "vle8.v v24, (%1)\n\t" ".option pop\n\t" : "=&r" (vl) : "r" (datap) : "memory"); } __vstate_csr_restore(restore_from); riscv_v_disable(); } static inline void __riscv_v_vstate_discard(void) { unsigned long vl, vtype_inval = 1UL << (BITS_PER_LONG - 1); riscv_v_enable(); if (has_xtheadvector()) asm volatile (THEAD_VSETVLI_T4X0E8M8D1 : : : "t4"); else asm volatile ( ".option push\n\t" ".option arch, +zve32x\n\t" "vsetvli %0, x0, e8, m8, ta, ma\n\t" ".option pop\n\t": "=&r" (vl)); asm volatile ( ".option push\n\t" ".option arch, +zve32x\n\t" "vmv.v.i v0, -1\n\t" "vmv.v.i v8, -1\n\t" "vmv.v.i v16, -1\n\t" "vmv.v.i v24, -1\n\t" "vsetvl %0, x0, %1\n\t" ".option pop\n\t" : "=&r" (vl) : "r" (vtype_inval)); riscv_v_disable(); } static inline void riscv_v_vstate_discard(struct pt_regs *regs) { if (riscv_v_vstate_query(regs)) { __riscv_v_vstate_discard(); __riscv_v_vstate_dirty(regs); } } static inline void riscv_v_vstate_save(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { if (__riscv_v_vstate_check(regs->status, DIRTY)) { __riscv_v_vstate_save(vstate, vstate->datap); __riscv_v_vstate_clean(regs); } } static inline void riscv_v_vstate_restore(struct __riscv_v_ext_state *vstate, struct pt_regs *regs) { if (riscv_v_vstate_query(regs)) { __riscv_v_vstate_restore(vstate, vstate->datap); __riscv_v_vstate_clean(regs); } } static inline void riscv_v_vstate_set_restore(struct task_struct *task, struct pt_regs *regs) { if (riscv_v_vstate_query(regs)) { set_tsk_thread_flag(task, TIF_RISCV_V_DEFER_RESTORE); riscv_v_vstate_on(regs); } } #ifdef CONFIG_RISCV_ISA_V_PREEMPTIVE static inline bool riscv_preempt_v_dirty(struct task_struct *task) { return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_DIRTY); } static inline bool riscv_preempt_v_restore(struct task_struct *task) { return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V_NEED_RESTORE); } static inline void riscv_preempt_v_clear_dirty(struct task_struct *task) { barrier(); task->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_DIRTY; } static inline void riscv_preempt_v_set_restore(struct task_struct *task) { barrier(); task->thread.riscv_v_flags |= RISCV_PREEMPT_V_NEED_RESTORE; } static inline bool riscv_preempt_v_started(struct task_struct *task) { return !!(task->thread.riscv_v_flags & RISCV_PREEMPT_V); } #else /* !CONFIG_RISCV_ISA_V_PREEMPTIVE */ static inline bool riscv_preempt_v_dirty(struct task_struct *task) { return false; } static inline bool riscv_preempt_v_restore(struct task_struct *task) { return false; } static inline bool riscv_preempt_v_started(struct task_struct *task) { return false; } #define riscv_preempt_v_clear_dirty(tsk) do {} while (0) #define riscv_preempt_v_set_restore(tsk) do {} while (0) #endif /* CONFIG_RISCV_ISA_V_PREEMPTIVE */ static inline void __switch_to_vector(struct task_struct *prev, struct task_struct *next) { struct pt_regs *regs; if (riscv_preempt_v_started(prev)) { if (riscv_v_is_on()) { WARN_ON(prev->thread.riscv_v_flags & RISCV_V_CTX_DEPTH_MASK); riscv_v_disable(); prev->thread.riscv_v_flags |= RISCV_PREEMPT_V_IN_SCHEDULE; } if (riscv_preempt_v_dirty(prev)) { __riscv_v_vstate_save(&prev->thread.kernel_vstate, prev->thread.kernel_vstate.datap); riscv_preempt_v_clear_dirty(prev); } } else { regs = task_pt_regs(prev); riscv_v_vstate_save(&prev->thread.vstate, regs); } if (riscv_preempt_v_started(next)) { if (next->thread.riscv_v_flags & RISCV_PREEMPT_V_IN_SCHEDULE) { next->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_IN_SCHEDULE; riscv_v_enable(); } else { riscv_preempt_v_set_restore(next); } } else { riscv_v_vstate_set_restore(next, task_pt_regs(next)); } } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); bool riscv_v_vstate_ctrl_user_allowed(void); #else /* ! CONFIG_RISCV_ISA_V */ struct pt_regs; static inline int riscv_v_setup_vsize(void) { return -EOPNOTSUPP; } static __always_inline bool has_vector(void) { return false; } static __always_inline bool insn_is_vector(u32 insn_buf) { return false; } static __always_inline bool has_xtheadvector_no_alternatives(void) { return false; } static __always_inline bool has_xtheadvector(void) { return false; } static inline bool riscv_v_first_use_handler(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_vsize (0) #define riscv_v_vstate_discard(regs) do {} while (0) #define riscv_v_vstate_save(vstate, regs) do {} while (0) #define riscv_v_vstate_restore(vstate, regs) do {} while (0) #define __switch_to_vector(__prev, __next) do {} while (0) #define riscv_v_vstate_off(regs) do {} while (0) #define riscv_v_vstate_on(regs) do {} while (0) #define riscv_v_thread_free(tsk) do {} while (0) #define riscv_v_setup_ctx_cache() do {} while (0) #define riscv_v_thread_alloc(tsk) do {} while (0) #endif /* CONFIG_RISCV_ISA_V */ /* * Return the implementation's vlen value. * * riscv_v_vsize contains the value of "32 vector registers with vlenb length" * so rebuild the vlen value in bits from it. */ static inline int riscv_vector_vlen(void) { return riscv_v_vsize / 32 * 8; } #endif /* ! __ASM_RISCV_VECTOR_H */
337 26 341 4 69 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-or-later /* * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IP/TCP/UDP checksumming routines * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Tom May, <ftom@netcom.com> * Andreas Schwab, <schwab@issan.informatik.uni-dortmund.de> * Lots of code moved from tcp.c and ip.c; see those files * for more names. * * 03/02/96 Jes Sorensen, Andreas Schwab, Roman Hodek: * Fixed some nasty bugs, causing some horrible crashes. * A: At some points, the sum (%0) was used as * length-counter instead of the length counter * (%1). Thanks to Roman Hodek for pointing this out. * B: GCC seems to mess up if one uses too many * data-registers to hold input values and one tries to * specify d0 and d1 as scratch registers. Letting gcc * choose these registers itself solves the problem. */ /* Revised by Kenneth Albanowski for m68knommu. Basic problem: unaligned access kills, so most of the assembly has to go. */ #include <linux/export.h> #include <net/checksum.h> #include <asm/byteorder.h> #ifndef do_csum static unsigned int do_csum(const unsigned char *buff, int len) { int odd; unsigned int result = 0; if (len <= 0) goto out; odd = 1 & (unsigned long) buff; if (odd) { #ifdef __LITTLE_ENDIAN result += (*buff << 8); #else result = *buff; #endif len--; buff++; } if (len >= 2) { if (2 & (unsigned long) buff) { result += *(unsigned short *) buff; len -= 2; buff += 2; } if (len >= 4) { const unsigned char *end = buff + ((unsigned)len & ~3); unsigned int carry = 0; do { unsigned int w = *(unsigned int *) buff; buff += 4; result += carry; result += w; carry = (w > result); } while (buff < end); result += carry; result = (result & 0xffff) + (result >> 16); } if (len & 2) { result += *(unsigned short *) buff; buff += 2; } } if (len & 1) #ifdef __LITTLE_ENDIAN result += *buff; #else result += (*buff << 8); #endif result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: return result; } #endif #ifndef ip_fast_csum /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. */ __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { return (__force __sum16)~do_csum(iph, ihl*4); } EXPORT_SYMBOL(ip_fast_csum); #endif /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * * returns a 32-bit number suitable for feeding into itself * or csum_tcpudp_magic * * this function must be called with even lengths, except * for the last fragment, which may be odd * * it's best to have buff aligned on a 32-bit boundary */ __wsum csum_partial(const void *buff, int len, __wsum wsum) { unsigned int sum = (__force unsigned int)wsum; unsigned int result = do_csum(buff, len); /* add in old sum, and carry.. */ result += sum; if (sum > result) result += 1; return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); /* * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ __sum16 ip_compute_csum(const void *buff, int len) { return (__force __sum16)~do_csum(buff, len); } EXPORT_SYMBOL(ip_compute_csum); #ifndef csum_tcpudp_nofold static inline u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); /* add up carry.. */ x = (x & 0xffffffff) + (x >> 32); return (u32)x; } __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { unsigned long long s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; #ifdef __BIG_ENDIAN s += proto + len; #else s += (proto + len) << 8; #endif return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif
86 13 14 13 85 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NAMEI_H #define _LINUX_NAMEI_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/path.h> #include <linux/fcntl.h> #include <linux/errno.h> #include <linux/fs_struct.h> enum { MAX_NESTED_LINKS = 8 }; #define MAXSYMLINKS 40 /* * Type of the last component on LOOKUP_PARENT */ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ #define LOOKUP_FOLLOW BIT(0) /* follow links at the end */ #define LOOKUP_DIRECTORY BIT(1) /* require a directory */ #define LOOKUP_AUTOMOUNT BIT(2) /* force terminal automount */ #define LOOKUP_EMPTY BIT(3) /* accept empty path [user_... only] */ #define LOOKUP_LINKAT_EMPTY BIT(4) /* Linkat request with empty path. */ #define LOOKUP_DOWN BIT(5) /* follow mounts in the starting point */ #define LOOKUP_MOUNTPOINT BIT(6) /* follow mounts in the end */ #define LOOKUP_REVAL BIT(7) /* tell ->d_revalidate() to trust no cache */ #define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */ #define LOOKUP_CACHED BIT(9) /* Only do cached lookup */ #define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */ /* 5 spare bits for pathwalk */ /* These tell filesystem methods that we are dealing with the final component... */ #define LOOKUP_OPEN BIT(16) /* ... in open */ #define LOOKUP_CREATE BIT(17) /* ... in object creation */ #define LOOKUP_EXCL BIT(18) /* ... in target must not exist */ #define LOOKUP_RENAME_TARGET BIT(19) /* ... in destination of rename() */ /* 4 spare bits for intent */ /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS BIT(24) /* No symlink crossing. */ #define LOOKUP_NO_MAGICLINKS BIT(25) /* No nd_jump_link() crossing. */ #define LOOKUP_NO_XDEV BIT(26) /* No mountpoint crossing. */ #define LOOKUP_BENEATH BIT(27) /* No escaping from starting point. */ #define LOOKUP_IN_ROOT BIT(28) /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) /* 3 spare bits for scoping */ extern int path_pts(struct path *path); extern int user_path_at(int, const char __user *, unsigned, struct path *); struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); extern void end_creating_path(const struct path *, struct dentry *); extern struct dentry *start_removing_path(const char *, struct path *); extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); static inline void end_removing_path(const struct path *path , struct dentry *dentry) { end_creating_path(path, dentry); } int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); extern struct dentry *try_lookup_noperm(struct qstr *, struct dentry *); extern struct dentry *lookup_noperm(struct qstr *, struct dentry *); extern struct dentry *lookup_noperm_unlocked(struct qstr *, struct dentry *); extern struct dentry *lookup_noperm_positive_unlocked(struct qstr *, struct dentry *); struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_creating_killable(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing_killable(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_creating_dentry(struct dentry *parent, struct dentry *child); struct dentry *start_removing_dentry(struct dentry *parent, struct dentry *child); /* end_creating - finish action started with start_creating * @child: dentry returned by start_creating() or vfs_mkdir() * * Unlock and release the child. This can be called after * start_creating() whether that function succeeded or not, * but it is not needed on failure. * * If vfs_mkdir() was called then the value returned from that function * should be given for @child rather than the original dentry, as vfs_mkdir() * may have provided a new dentry. * * * If vfs_mkdir() was not called, then @child will be a valid dentry and * @parent will be ignored. */ static inline void end_creating(struct dentry *child) { end_dirop(child); } /* end_creating_keep - finish action started with start_creating() and return result * @child: dentry returned by start_creating() or vfs_mkdir() * * Unlock and return the child. This can be called after * start_creating() whether that function succeeded or not, * but it is not needed on failure. * * If vfs_mkdir() was called then the value returned from that function * should be given for @child rather than the original dentry, as vfs_mkdir() * may have provided a new dentry. * * Returns: @child, which may be a dentry or an error. * */ static inline struct dentry *end_creating_keep(struct dentry *child) { if (!IS_ERR(child)) dget(child); end_dirop(child); return child; } /** * end_removing - finish action started with start_removing * @child: dentry returned by start_removing() * @parent: dentry given to start_removing() * * Unlock and release the child. * * This is identical to end_dirop(). It can be passed the result of * start_removing() whether that was successful or not, but it not needed * if start_removing() failed. */ static inline void end_removing(struct dentry *child) { end_dirop(child); } extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); int start_renaming_dentry(struct renamedata *rd, int lookup_flags, struct dentry *old_dentry, struct qstr *new_last); int start_renaming_two_dentries(struct renamedata *rd, struct dentry *old_dentry, struct dentry *new_dentry); void end_renaming(struct renamedata *rd); /** * mode_strip_umask - handle vfs umask stripping * @dir: parent directory of the new inode * @mode: mode of the new inode to be created in @dir * * In most filesystems, umask stripping depends on whether or not the * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask * stripping is done directly in here. If the filesystem does support POSIX * ACLs umask stripping is deferred until the filesystem calls * posix_acl_create(). * * Some filesystems (like NFSv4) also want to avoid umask stripping by the * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK * to get this effect without declaring that they support POSIX ACLs. * * Returns: mode */ static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode) { if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK)) mode &= ~current_umask(); return mode; } extern int __must_check nd_jump_link(const struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { ((char *) name)[min(len, maxlen)] = '\0'; } /** * retry_estale - determine whether the caller should retry an operation * @error: the error that would currently be returned * @flags: flags being used for next lookup attempt * * Check to see if the error code was -ESTALE, and then determine whether * to retry the call based on whether "flags" already has LOOKUP_REVAL set. * * Returns true if the caller should try the operation again. */ static inline bool retry_estale(const long error, const unsigned int flags) { return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL)); } #endif /* _LINUX_NAMEI_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 /* SPDX-License-Identifier: GPL-2.0-or-later */ /** * eCryptfs: Linux filesystem encryption layer * Kernel declarations. * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University * Copyright (C) 2004-2008 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Trevor S. Highland <trevor.highland@gmail.com> * Tyler Hicks <code@tyhicks.com> */ #ifndef ECRYPTFS_KERNEL_H #define ECRYPTFS_KERNEL_H #include <crypto/md5.h> #include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/fs_stack.h> #include <linux/namei.h> #include <linux/scatterlist.h> #include <linux/hash.h> #include <linux/nsproxy.h> #include <linux/backing-dev.h> #include <linux/ecryptfs.h> #define ECRYPTFS_DEFAULT_IV_BYTES 16 #define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 #define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 #define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 #define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ #define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 #define ECRYPTFS_XATTR_NAME "user.ecryptfs" void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); static inline void ecryptfs_to_hex(char *dst, char *src, size_t src_size) { char *end = bin2hex(dst, src, src_size); *end = '\0'; } extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); struct ecryptfs_key_record { unsigned char type; size_t enc_key_size; unsigned char sig[ECRYPTFS_SIG_SIZE]; unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; }; struct ecryptfs_auth_tok_list { struct ecryptfs_auth_tok *auth_tok; struct list_head list; }; struct ecryptfs_crypt_stat; struct ecryptfs_mount_crypt_stat; struct ecryptfs_page_crypt_context { struct page *page; #define ECRYPTFS_PREPARE_COMMIT_MODE 0 #define ECRYPTFS_WRITEPAGE_MODE 1 unsigned int mode; union { struct file *lower_file; struct writeback_control *wbc; } param; }; #if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) static inline struct ecryptfs_auth_tok * ecryptfs_get_encrypted_key_payload_data(struct key *key) { struct encrypted_key_payload *payload; if (key->type != &key_type_encrypted) return NULL; payload = key->payload.data[0]; if (!payload) return ERR_PTR(-EKEYREVOKED); return (struct ecryptfs_auth_tok *)payload->payload_data; } static inline struct key *ecryptfs_get_encrypted_key(char *sig) { return request_key(&key_type_encrypted, sig, NULL); } #else static inline struct ecryptfs_auth_tok * ecryptfs_get_encrypted_key_payload_data(struct key *key) { return NULL; } static inline struct key *ecryptfs_get_encrypted_key(char *sig) { return ERR_PTR(-ENOKEY); } #endif /* CONFIG_ENCRYPTED_KEYS */ static inline struct ecryptfs_auth_tok * ecryptfs_get_key_payload_data(struct key *key) { struct ecryptfs_auth_tok *auth_tok; struct user_key_payload *ukp; auth_tok = ecryptfs_get_encrypted_key_payload_data(key); if (auth_tok) return auth_tok; ukp = user_key_payload_locked(key); if (!ukp) return ERR_PTR(-EKEYREVOKED); return (struct ecryptfs_auth_tok *)ukp->data; } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 #define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 #define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ #define ECRYPTFS_SALT_BYTES 2 #define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 #define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ #define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64)) #define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \ + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED #define ECRYPTFS_TAG_64_PACKET_TYPE 0x40 #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 #define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename * as dentry name */ #define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in * metadata */ #define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as * dentry name */ #define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as * metadata */ #define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */ #define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to * ecryptfs_parse_packet_length() and * ecryptfs_write_packet_length() */ /* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= * ECRYPTFS_MAX_IV_BYTES */ #define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ #define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 #define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) #ifdef CONFIG_ECRYPT_FS_MESSAGING # define ECRYPTFS_VERSIONING_MASK_MESSAGING (ECRYPTFS_VERSIONING_DEVMISC \ | ECRYPTFS_VERSIONING_PUBKEY) #else # define ECRYPTFS_VERSIONING_MASK_MESSAGING 0 #endif #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | ECRYPTFS_VERSIONING_XATTR \ | ECRYPTFS_VERSIONING_MULTKEY \ | ECRYPTFS_VERSIONING_MASK_MESSAGING \ | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) struct ecryptfs_key_sig { struct list_head crypt_stat_list; char keysig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; struct ecryptfs_filename { struct list_head crypt_stat_list; #define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 u32 flags; u32 seq_no; char *filename; char *encrypted_filename; size_t filename_size; size_t encrypted_filename_size; char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; }; /** * This is the primary struct associated with each encrypted file. * * TODO: cache align/pack? */ struct ecryptfs_crypt_stat { #define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 #define ECRYPTFS_POLICY_APPLIED 0x00000002 #define ECRYPTFS_ENCRYPTED 0x00000004 #define ECRYPTFS_SECURITY_WARNING 0x00000008 #define ECRYPTFS_ENABLE_HMAC 0x00000010 #define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000020 #define ECRYPTFS_KEY_VALID 0x00000040 #define ECRYPTFS_METADATA_IN_XATTR 0x00000080 #define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000100 #define ECRYPTFS_KEY_SET 0x00000200 #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000400 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800 #define ECRYPTFS_ENCFN_USE_FEK 0x00001000 #define ECRYPTFS_UNLINK_SIGS 0x00002000 #define ECRYPTFS_I_SIZE_INITIALIZED 0x00004000 u32 flags; unsigned int file_version; size_t iv_bytes; size_t metadata_size; size_t extent_size; /* Data extent size; default is 4096 */ size_t key_size; size_t extent_shift; unsigned int extent_mask; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct crypto_skcipher *tfm; unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; struct list_head keysig_list; struct mutex keysig_list_mutex; struct mutex cs_tfm_mutex; struct mutex cs_mutex; }; /* inode private data. */ struct ecryptfs_inode_info { struct inode vfs_inode; struct inode *wii_inode; struct mutex lower_file_mutex; atomic_t lower_file_count; struct file *lower_file; struct ecryptfs_crypt_stat crypt_stat; }; /** * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint * @flags: Status flags * @mount_crypt_stat_list: These auth_toks hang off the mount-wide * cryptographic context. Every time a new * inode comes into existence, eCryptfs copies * the auth_toks on that list to the set of * auth_toks on the inode's crypt_stat * @global_auth_tok_key: The key from the user's keyring for the sig * @global_auth_tok: The key contents * @sig: The key identifier * * ecryptfs_global_auth_tok structs refer to authentication token keys * in the user keyring that apply to newly created files. A list of * these objects hangs off of the mount_crypt_stat struct for any * given eCryptfs mount. This struct maintains a reference to both the * key contents and the key itself so that the key can be put on * unmount. */ struct ecryptfs_global_auth_tok { #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 #define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 u32 flags; struct list_head mount_crypt_stat_list; struct key *global_auth_tok_key; unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; /** * ecryptfs_key_tfm - Persistent key tfm * @key_tfm: crypto API handle to the key * @key_size: Key size in bytes * @key_tfm_mutex: Mutex to ensure only one operation in eCryptfs is * using the persistent TFM at any point in time * @key_tfm_list: Handle to hang this off the module-wide TFM list * @cipher_name: String name for the cipher for this TFM * * Typically, eCryptfs will use the same ciphers repeatedly throughout * the course of its operations. In order to avoid unnecessarily * destroying and initializing the same cipher repeatedly, eCryptfs * keeps a list of crypto API contexts around to use when needed. */ struct ecryptfs_key_tfm { struct crypto_skcipher *key_tfm; size_t key_size; struct mutex key_tfm_mutex; struct list_head key_tfm_list; unsigned char cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; }; extern struct mutex key_tfm_list_mutex; /** * This struct is to enable a mount-wide passphrase/salt combo. This * is more or less a stopgap to provide similar functionality to other * crypto filesystems like EncFS or CFS until full policy support is * implemented in eCryptfs. */ struct ecryptfs_mount_crypt_stat { /* Pointers to memory we do not own, do not free these */ #define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001 #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 #define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080 u32 flags; struct list_head global_auth_tok_list; struct mutex global_auth_tok_list_mutex; size_t global_default_cipher_key_size; size_t global_default_fn_cipher_key_bytes; unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char global_default_fn_cipher_name[ ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; /* superblock private data. */ struct ecryptfs_sb_info { struct super_block *wsi_sb; struct vfsmount *lower_mnt; struct ecryptfs_mount_crypt_stat mount_crypt_stat; }; /* file private data. */ struct ecryptfs_file_info { struct file *wfi_file; struct ecryptfs_crypt_stat *crypt_stat; }; /* auth_tok <=> encrypted_session_key mappings */ struct ecryptfs_auth_tok_list_item { unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES]; struct list_head list; struct ecryptfs_auth_tok auth_tok; }; struct ecryptfs_message { /* Can never be greater than ecryptfs_message_buf_len */ /* Used to find the parent msg_ctx */ /* Inherits from msg_ctx->index */ u32 index; u32 data_len; u8 data[]; }; struct ecryptfs_msg_ctx { #define ECRYPTFS_MSG_CTX_STATE_FREE 0x01 #define ECRYPTFS_MSG_CTX_STATE_PENDING 0x02 #define ECRYPTFS_MSG_CTX_STATE_DONE 0x03 #define ECRYPTFS_MSG_CTX_STATE_NO_REPLY 0x04 u8 state; #define ECRYPTFS_MSG_HELO 100 #define ECRYPTFS_MSG_QUIT 101 #define ECRYPTFS_MSG_REQUEST 102 #define ECRYPTFS_MSG_RESPONSE 103 u8 type; u32 index; /* Counter converts to a sequence number. Each message sent * out for which we expect a response has an associated * sequence number. The response must have the same sequence * number as the counter for the msg_stc for the message to be * valid. */ u32 counter; size_t msg_size; struct ecryptfs_message *msg; struct task_struct *task; struct list_head node; struct list_head daemon_out_list; struct mutex mux; }; struct ecryptfs_daemon { #define ECRYPTFS_DAEMON_IN_READ 0x00000001 #define ECRYPTFS_DAEMON_IN_POLL 0x00000002 #define ECRYPTFS_DAEMON_ZOMBIE 0x00000004 #define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 u32 flags; u32 num_queued_msg_ctx; struct file *file; struct mutex mux; struct list_head msg_ctx_out_queue; wait_queue_head_t wait; struct hlist_node euid_chain; }; #ifdef CONFIG_ECRYPT_FS_MESSAGING extern struct mutex ecryptfs_daemon_hash_mux; #endif static inline size_t ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) return 0; return crypt_stat->metadata_size; } static inline struct ecryptfs_file_info * ecryptfs_file_to_private(struct file *file) { return file->private_data; } static inline void ecryptfs_set_file_private(struct file *file, struct ecryptfs_file_info *file_info) { file->private_data = file_info; } static inline struct file *ecryptfs_file_to_lower(struct file *file) { return ((struct ecryptfs_file_info *)file->private_data)->wfi_file; } static inline void ecryptfs_set_file_lower(struct file *file, struct file *lower_file) { ((struct ecryptfs_file_info *)file->private_data)->wfi_file = lower_file; } static inline struct ecryptfs_inode_info * ecryptfs_inode_to_private(struct inode *inode) { return container_of(inode, struct ecryptfs_inode_info, vfs_inode); } static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode) { return ecryptfs_inode_to_private(inode)->wii_inode; } static inline void ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode) { ecryptfs_inode_to_private(inode)->wii_inode = lower_inode; } static inline struct ecryptfs_sb_info * ecryptfs_superblock_to_private(struct super_block *sb) { return (struct ecryptfs_sb_info *)sb->s_fs_info; } static inline void ecryptfs_set_superblock_private(struct super_block *sb, struct ecryptfs_sb_info *sb_info) { sb->s_fs_info = sb_info; } static inline struct super_block * ecryptfs_superblock_to_lower(struct super_block *sb) { return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb; } static inline void ecryptfs_set_superblock_lower(struct super_block *sb, struct super_block *lower_sb) { ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; } static inline void ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry) { dentry->d_fsdata = lower_dentry; } static inline struct dentry * ecryptfs_dentry_to_lower(struct dentry *dentry) { return dentry->d_fsdata; } static inline struct path ecryptfs_lower_path(struct dentry *dentry) { return (struct path){ .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt, .dentry = ecryptfs_dentry_to_lower(dentry) }; } #define ecryptfs_printk(type, fmt, arg...) \ __ecryptfs_printk(type "%s: " fmt, __func__, ## arg) __printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; extern const struct file_operations ecryptfs_dir_fops; extern const struct inode_operations ecryptfs_main_iops; extern const struct inode_operations ecryptfs_dir_iops; extern const struct inode_operations ecryptfs_symlink_iops; extern const struct super_operations ecryptfs_sops; extern const struct dentry_operations ecryptfs_dops; extern const struct address_space_operations ecryptfs_aops; extern int ecryptfs_verbosity; extern unsigned int ecryptfs_message_buf_len; extern signed long ecryptfs_message_wait_timeout; extern unsigned int ecryptfs_number_of_users; extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; extern struct kmem_cache *ecryptfs_file_info_cache; extern struct kmem_cache *ecryptfs_inode_info_cache; extern struct kmem_cache *ecryptfs_sb_info_cache; extern struct kmem_cache *ecryptfs_header_cache; extern struct kmem_cache *ecryptfs_xattr_cache; extern struct kmem_cache *ecryptfs_key_record_cache; extern struct kmem_cache *ecryptfs_key_sig_cache; extern struct kmem_cache *ecryptfs_global_auth_tok_cache; extern struct kmem_cache *ecryptfs_key_tfm_cache; struct inode *ecryptfs_get_inode(struct inode *lower_inode, struct super_block *sb); void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, struct inode *ecryptfs_inode); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, size_t *decrypted_name_size, struct super_block *sb, const char *name, size_t name_size); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); int ecryptfs_encrypt_and_encode_filename( char **encoded_name, size_t *encoded_name_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_rotate_iv(unsigned char *iv); void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); void ecryptfs_destroy_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat); int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); int ecryptfs_encrypt_page(struct folio *folio); int ecryptfs_decrypt_page(struct folio *folio); int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); int ecryptfs_new_file_context(struct inode *ecryptfs_inode); void ecryptfs_write_crypt_stat_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); int ecryptfs_read_and_validate_header_region(struct inode *inode); int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, struct inode *inode); u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_generate_key_packet_set(char *dest_base, struct ecryptfs_crypt_stat *crypt_stat, struct dentry *ecryptfs_dentry, size_t *len, size_t max); int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, unsigned char *src, struct dentry *ecryptfs_dentry); int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); ssize_t ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, const char *name, void *value, size_t size); int ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); #ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, struct ecryptfs_message *msg, u32 seq); int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx); int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg); int ecryptfs_init_messaging(void); void ecryptfs_release_messaging(void); #else static inline int ecryptfs_init_messaging(void) { return 0; } static inline void ecryptfs_release_messaging(void) { } static inline int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx) { return -ENOTCONN; } static inline int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg) { return -ENOMSG; } #endif void ecryptfs_write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig, u32 global_auth_tok_flags); int ecryptfs_get_global_auth_tok_for_sig( struct ecryptfs_global_auth_tok **global_auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); int ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, size_t key_size); int ecryptfs_init_crypto(void); int ecryptfs_destroy_crypto(void); int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm); int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm, struct mutex **tfm_mutex, char *cipher_name); int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, struct ecryptfs_auth_tok **auth_tok, char *sig); int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size); int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, struct folio *folio_for_lower, size_t offset_in_page, size_t size); int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode); int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, size_t *length_size); int ecryptfs_write_packet_length(char *dest, size_t size, size_t *packet_size_length); #ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_init_ecryptfs_miscdev(void); void ecryptfs_destroy_ecryptfs_miscdev(void); int ecryptfs_send_miscdev(char *data, size_t data_size, struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, u16 msg_flags, struct ecryptfs_daemon *daemon); void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); int ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file); int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon); #endif int ecryptfs_init_kthread(void); void ecryptfs_destroy_kthread(void); int ecryptfs_privileged_open(struct file **lower_file, struct dentry *lower_dentry, struct vfsmount *lower_mnt, const struct cred *cred); int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode); void ecryptfs_put_lower_file(struct inode *inode); int ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, size_t *packet_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *filename, size_t filename_size); int ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, size_t *packet_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *data, size_t max_packet_size); int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, struct ecryptfs_mount_crypt_stat *mount_crypt_stat); void ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, loff_t offset); extern const struct xattr_handler * const ecryptfs_xattr_handlers[]; #endif /* #ifndef ECRYPTFS_KERNEL_H */
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Generic internet FLOW. * */ #ifndef _NET_FLOW_H #define _NET_FLOW_H #include <linux/in6.h> #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/uidgid.h> #include <net/inet_dscp.h> struct flow_keys; /* * ifindex generation is per-net namespace, and loopback is * always the 1st device in ns (see net_dev_init), thus any * loopback device should get ifindex 1 */ #define LOOPBACK_IFINDEX 1 struct flowi_tunnel { __be64 tun_id; }; struct flowi_common { int flowic_oif; int flowic_iif; int flowic_l3mdev; __u32 flowic_mark; dscp_t flowic_dscp; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_L3MDEV_OIF 0x04 #define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; struct flowi_tunnel flowic_tun_key; }; union flowi_uli { struct { __be16 dport; __be16 sport; } ports; struct { __u8 type; __u8 code; } icmpt; __be32 gre_key; struct { __u8 type; } mht; }; struct flowi4 { struct flowi_common __fl_common; #define flowi4_oif __fl_common.flowic_oif #define flowi4_iif __fl_common.flowic_iif #define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark #define flowi4_dscp __fl_common.flowic_dscp #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid #define flowi4_multipath_hash __fl_common.flowic_multipath_hash /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; __be32 daddr; union flowi_uli uli; #define fl4_sport uli.ports.sport #define fl4_dport uli.ports.dport #define fl4_icmp_type uli.icmpt.type #define fl4_icmp_code uli.icmpt.code #define fl4_mh_type uli.mht.type #define fl4_gre_key uli.gre_key } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, kuid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = sport; fl4->flowi4_multipath_hash = 0; } /* Reset some input parameters after previous lookup */ static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __be32 daddr, __be32 saddr) { fl4->flowi4_oif = oif; fl4->daddr = daddr; fl4->saddr = saddr; } struct flowi6 { struct flowi_common __fl_common; #define flowi6_oif __fl_common.flowic_oif #define flowi6_iif __fl_common.flowic_iif #define flowi6_l3mdev __fl_common.flowic_l3mdev #define flowi6_mark __fl_common.flowic_mark #define flowi6_scope __fl_common.flowic_scope #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_dscp is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport #define fl6_dport uli.ports.dport #define fl6_icmp_type uli.icmpt.type #define fl6_icmp_code uli.icmpt.code #define fl6_mh_type uli.mht.type #define fl6_gre_key uli.gre_key __u32 mp_hash; } __attribute__((__aligned__(BITS_PER_LONG/8))); struct flowi { union { struct flowi_common __fl_common; struct flowi4 ip4; struct flowi6 ip6; } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark #define flowi_dscp u.__fl_common.flowic_dscp #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) { return container_of(fl4, struct flowi, u.ip4); } static inline struct flowi_common *flowi4_to_flowi_common(struct flowi4 *fl4) { return &(fl4->__fl_common); } static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) { return container_of(fl6, struct flowi, u.ip6); } static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6) { return &(fl6->__fl_common); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); #endif
8 8 1 2 3 4 5 6 7 8 9 10 11 12 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_CORE_SOCK_DESTRUCTOR_H #define _NET_CORE_SOCK_DESTRUCTOR_H #include <net/tcp.h> static inline bool is_skb_wmem(const struct sk_buff *skb) { return skb->destructor == sock_wfree || skb->destructor == __sock_wfree || (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree); } #endif
9 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 /* * linux/drivers/video/fb_defio.c * * Copyright (C) 2006 Jaya Kumar * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/fb.h> #include <linux/list.h> /* to support deferred IO */ #include <linux/rmap.h> #include <linux/pagemap.h> static struct page *fb_deferred_io_get_page(struct fb_info *info, unsigned long offs) { struct fb_deferred_io *fbdefio = info->fbdefio; const void *screen_buffer = info->screen_buffer; struct page *page = NULL; if (fbdefio->get_page) return fbdefio->get_page(info, offs); if (is_vmalloc_addr(screen_buffer + offs)) page = vmalloc_to_page(screen_buffer + offs); else if (info->fix.smem_start) page = pfn_to_page((info->fix.smem_start + offs) >> PAGE_SHIFT); if (page) get_page(page); return page; } static struct fb_deferred_io_pageref *fb_deferred_io_pageref_lookup(struct fb_info *info, unsigned long offset, struct page *page) { unsigned long pgoff = offset >> PAGE_SHIFT; struct fb_deferred_io_pageref *pageref; if (fb_WARN_ON_ONCE(info, pgoff >= info->npagerefs)) return NULL; /* incorrect allocation size */ /* 1:1 mapping between pageref and page offset */ pageref = &info->pagerefs[pgoff]; if (pageref->page) goto out; pageref->page = page; pageref->offset = pgoff << PAGE_SHIFT; INIT_LIST_HEAD(&pageref->list); out: if (fb_WARN_ON_ONCE(info, pageref->page != page)) return NULL; /* inconsistent state */ return pageref; } static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct fb_info *info, unsigned long offset, struct page *page) { struct fb_deferred_io *fbdefio = info->fbdefio; struct list_head *pos = &fbdefio->pagereflist; struct fb_deferred_io_pageref *pageref, *cur; pageref = fb_deferred_io_pageref_lookup(info, offset, page); if (!pageref) return NULL; /* * This check is to catch the case where a new process could start * writing to the same page through a new PTE. This new access * can cause a call to .page_mkwrite even if the original process' * PTE is marked writable. */ if (!list_empty(&pageref->list)) goto pageref_already_added; if (unlikely(fbdefio->sort_pagereflist)) { /* * We loop through the list of pagerefs before adding in * order to keep the pagerefs sorted. This has significant * overhead of O(n^2) with n being the number of written * pages. If possible, drivers should try to work with * unsorted page lists instead. */ list_for_each_entry(cur, &fbdefio->pagereflist, list) { if (cur->offset > pageref->offset) break; } pos = &cur->list; } list_add_tail(&pageref->list, pos); pageref_already_added: return pageref; } static void fb_deferred_io_pageref_put(struct fb_deferred_io_pageref *pageref, struct fb_info *info) { list_del_init(&pageref->list); } /* this is to find and return the vmalloc-ed fb pages */ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; struct fb_info *info = vmf->vma->vm_private_data; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= info->fix.smem_len) return VM_FAULT_SIGBUS; page = fb_deferred_io_get_page(info, offset); if (!page) return VM_FAULT_SIGBUS; if (!vmf->vma->vm_file) fb_err(info, "no mapping available\n"); BUG_ON(!info->fbdefio->mapping); vmf->page = page; return 0; } int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fb_info *info = file->private_data; struct inode *inode = file_inode(file); int err = file_write_and_wait_range(file, start, end); if (err) return err; /* Skip if deferred io is compiled-in but disabled on this fbdev */ if (!info->fbdefio) return 0; inode_lock(inode); flush_delayed_work(&info->deferred_work); inode_unlock(inode); return 0; } EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* * Adds a page to the dirty list. Call this from struct * vm_operations_struct.page_mkwrite. */ static vm_fault_t fb_deferred_io_track_page(struct fb_info *info, unsigned long offset, struct page *page) { struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_pageref *pageref; vm_fault_t ret; /* protect against the workqueue changing the page list */ mutex_lock(&fbdefio->lock); pageref = fb_deferred_io_pageref_get(info, offset, page); if (WARN_ON_ONCE(!pageref)) { ret = VM_FAULT_OOM; goto err_mutex_unlock; } /* * We want the page to remain locked from ->page_mkwrite until * the PTE is marked dirty to avoid mapping_wrprotect_range() * being called before the PTE is updated, which would leave * the page ignored by defio. * Do this by locking the page here and informing the caller * about it with VM_FAULT_LOCKED. */ lock_page(pageref->page); mutex_unlock(&fbdefio->lock); /* come back after delay to process the deferred IO */ schedule_delayed_work(&info->deferred_work, fbdefio->delay); return VM_FAULT_LOCKED; err_mutex_unlock: mutex_unlock(&fbdefio->lock); return ret; } /* * fb_deferred_io_page_mkwrite - Mark a page as written for deferred I/O * @fb_info: The fbdev info structure * @vmf: The VM fault * * This is a callback we get when userspace first tries to * write to the page. We schedule a workqueue. That workqueue * will eventually mkclean the touched pages and execute the * deferred framebuffer IO. Then if userspace touches a page * again, we repeat the same scheme. * * Returns: * VM_FAULT_LOCKED on success, or a VM_FAULT error otherwise. */ static vm_fault_t fb_deferred_io_page_mkwrite(struct fb_info *info, struct vm_fault *vmf) { unsigned long offset = vmf->pgoff << PAGE_SHIFT; struct page *page = vmf->page; file_update_time(vmf->vma->vm_file); return fb_deferred_io_track_page(info, offset, page); } /* vm_ops->page_mkwrite handler */ static vm_fault_t fb_deferred_io_mkwrite(struct vm_fault *vmf) { struct fb_info *info = vmf->vma->vm_private_data; return fb_deferred_io_page_mkwrite(info, vmf); } static const struct vm_operations_struct fb_deferred_io_vm_ops = { .fault = fb_deferred_io_fault, .page_mkwrite = fb_deferred_io_mkwrite, }; static const struct address_space_operations fb_deferred_io_aops = { .dirty_folio = noop_dirty_folio, }; int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); vma->vm_ops = &fb_deferred_io_vm_ops; vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) vm_flags_set(vma, VM_IO); vma->vm_private_data = info; return 0; } EXPORT_SYMBOL_GPL(fb_deferred_io_mmap); /* workqueue callback */ static void fb_deferred_io_work(struct work_struct *work) { struct fb_info *info = container_of(work, struct fb_info, deferred_work.work); struct fb_deferred_io_pageref *pageref, *next; struct fb_deferred_io *fbdefio = info->fbdefio; /* here we wrprotect the page's mappings, then do all deferred IO. */ mutex_lock(&fbdefio->lock); #ifdef CONFIG_MMU list_for_each_entry(pageref, &fbdefio->pagereflist, list) { struct page *page = pageref->page; pgoff_t pgoff = pageref->offset >> PAGE_SHIFT; mapping_wrprotect_range(fbdefio->mapping, pgoff, page_to_pfn(page), 1); } #endif /* driver's callback with pagereflist */ fbdefio->deferred_io(info, &fbdefio->pagereflist); /* clear the list */ list_for_each_entry_safe(pageref, next, &fbdefio->pagereflist, list) fb_deferred_io_pageref_put(pageref, info); mutex_unlock(&fbdefio->lock); } int fb_deferred_io_init(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_pageref *pagerefs; unsigned long npagerefs; int ret; BUG_ON(!fbdefio); if (WARN_ON(!info->fix.smem_len)) return -EINVAL; mutex_init(&fbdefio->lock); INIT_DELAYED_WORK(&info->deferred_work, fb_deferred_io_work); INIT_LIST_HEAD(&fbdefio->pagereflist); if (fbdefio->delay == 0) /* set a default of 1 s */ fbdefio->delay = HZ; npagerefs = DIV_ROUND_UP(info->fix.smem_len, PAGE_SIZE); /* alloc a page ref for each page of the display memory */ pagerefs = kvcalloc(npagerefs, sizeof(*pagerefs), GFP_KERNEL); if (!pagerefs) { ret = -ENOMEM; goto err; } info->npagerefs = npagerefs; info->pagerefs = pagerefs; return 0; err: mutex_destroy(&fbdefio->lock); return ret; } EXPORT_SYMBOL_GPL(fb_deferred_io_init); void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file) { struct fb_deferred_io *fbdefio = info->fbdefio; fbdefio->mapping = file->f_mapping; file->f_mapping->a_ops = &fb_deferred_io_aops; fbdefio->open_count++; } EXPORT_SYMBOL_GPL(fb_deferred_io_open); static void fb_deferred_io_lastclose(struct fb_info *info) { flush_delayed_work(&info->deferred_work); } void fb_deferred_io_release(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; if (!--fbdefio->open_count) fb_deferred_io_lastclose(info); } EXPORT_SYMBOL_GPL(fb_deferred_io_release); void fb_deferred_io_cleanup(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; fb_deferred_io_lastclose(info); kvfree(info->pagerefs); mutex_destroy(&fbdefio->lock); fbdefio->mapping = NULL; } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup);
18 18 18 18 17 17 11 11 11 20 19 11 20 7 20 12 12 20 20 12 12 12 27 3 27 27 3 27 26 27 27 15 12 8 8 8 7 7 7 15 3 3 2 2 2 2 15 5 5 5 5 5 1 4 5 5 5 5 5 3 3 2 2 27 27 27 27 27 27 11 2 25 25 25 25 24 23 22 3 22 7 4 15 19 19 19 19 19 19 5 19 5 19 8 3 11 19 2 2 16 19 12 11 7 18 17 17 16 16 16 16 16 7 9 9 9 25 11 11 10 11 11 11 11 11 5 11 11 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 // SPDX-License-Identifier: GPL-2.0 /* * USB Serial Converter driver * * Copyright (C) 2009 - 2013 Johan Hovold (jhovold@gmail.com) * Copyright (C) 1999 - 2012 Greg Kroah-Hartman (greg@kroah.com) * Copyright (C) 2000 Peter Berger (pberger@brimson.com) * Copyright (C) 2000 Al Borchers (borchers@steinerpoint.com) * * This driver was originally based on the ACM driver by Armin Fuerst (which was * based on a driver by Brad Keryan) * * See Documentation/usb/usb-serial.rst for more information on using this * driver */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/seq_file.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/serial.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/kfifo.h> #include <linux/idr.h> #define DRIVER_AUTHOR "Greg Kroah-Hartman <gregkh@linuxfoundation.org>" #define DRIVER_DESC "USB Serial Driver core" #define USB_SERIAL_TTY_MAJOR 188 #define USB_SERIAL_TTY_MINORS 512 /* should be enough for a while */ /* There is no MODULE_DEVICE_TABLE for usbserial.c. Instead the MODULE_DEVICE_TABLE declarations in each serial driver cause the "hotplug" program to pull in whatever module is necessary via modprobe, and modprobe will load usbserial because the serial drivers depend on it. */ static DEFINE_IDR(serial_minors); static DEFINE_MUTEX(table_lock); static LIST_HEAD(usb_serial_driver_list); /* * Look up the serial port structure. If it is found and it hasn't been * disconnected, return with the parent usb_serial structure's disc_mutex held * and its refcount incremented. Otherwise return NULL. */ struct usb_serial_port *usb_serial_port_get_by_minor(unsigned minor) { struct usb_serial *serial; struct usb_serial_port *port; mutex_lock(&table_lock); port = idr_find(&serial_minors, minor); if (!port) goto exit; serial = port->serial; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { mutex_unlock(&serial->disc_mutex); port = NULL; } else { kref_get(&serial->kref); } exit: mutex_unlock(&table_lock); return port; } static int allocate_minors(struct usb_serial *serial, int num_ports) { struct usb_serial_port *port; unsigned int i, j; int minor; dev_dbg(&serial->interface->dev, "%s %d\n", __func__, num_ports); mutex_lock(&table_lock); for (i = 0; i < num_ports; ++i) { port = serial->port[i]; minor = idr_alloc(&serial_minors, port, 0, USB_SERIAL_TTY_MINORS, GFP_KERNEL); if (minor < 0) goto error; port->minor = minor; port->port_number = i; } serial->minors_reserved = 1; mutex_unlock(&table_lock); return 0; error: /* unwind the already allocated minors */ for (j = 0; j < i; ++j) idr_remove(&serial_minors, serial->port[j]->minor); mutex_unlock(&table_lock); return minor; } static void release_minors(struct usb_serial *serial) { int i; mutex_lock(&table_lock); for (i = 0; i < serial->num_ports; ++i) idr_remove(&serial_minors, serial->port[i]->minor); mutex_unlock(&table_lock); serial->minors_reserved = 0; } int usb_serial_claim_interface(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; int ret; if (serial->sibling) return -EBUSY; ret = usb_driver_claim_interface(driver, intf, serial); if (ret) { dev_err(&serial->interface->dev, "failed to claim sibling interface: %d\n", ret); return ret; } serial->sibling = intf; return 0; } EXPORT_SYMBOL_GPL(usb_serial_claim_interface); static void release_sibling(struct usb_serial *serial, struct usb_interface *intf) { struct usb_driver *driver = serial->type->usb_driver; struct usb_interface *sibling; if (!serial->sibling) return; if (intf == serial->sibling) sibling = serial->interface; else sibling = serial->sibling; usb_set_intfdata(sibling, NULL); usb_driver_release_interface(driver, sibling); } static void destroy_serial(struct kref *kref) { struct usb_serial *serial; struct usb_serial_port *port; int i; serial = to_usb_serial(kref); /* return the minor range that this device had */ if (serial->minors_reserved) release_minors(serial); if (serial->attached && serial->type->release) serial->type->release(serial); /* Now that nothing is using the ports, they can be freed */ for (i = 0; i < serial->num_port_pointers; ++i) { port = serial->port[i]; if (port) { port->serial = NULL; put_device(&port->dev); } } usb_put_intf(serial->interface); usb_put_dev(serial->dev); kfree(serial); } void usb_serial_put(struct usb_serial *serial) { kref_put(&serial->kref, destroy_serial); } /***************************************************************************** * Driver tty interface functions *****************************************************************************/ /** * serial_install - install tty * @driver: the driver (USB in our case) * @tty: the tty being created * * Initialise the termios structure for this tty. We use the default * USB serial settings but permit them to be overridden by * serial->type->init_termios on first open. * * This is the first place a new tty gets used. Hence this is where we * acquire references to the usb_serial structure and the driver module, * where we store a pointer to the port. All these actions are reversed * in serial_cleanup(). */ static int serial_install(struct tty_driver *driver, struct tty_struct *tty) { int idx = tty->index; struct usb_serial *serial; struct usb_serial_port *port; bool init_termios; int retval = -ENODEV; port = usb_serial_port_get_by_minor(idx); if (!port) return retval; serial = port->serial; if (!try_module_get(serial->type->driver.owner)) goto err_put_serial; init_termios = (driver->termios[idx] == NULL); retval = tty_standard_install(driver, tty); if (retval) goto err_put_module; mutex_unlock(&serial->disc_mutex); /* allow the driver to update the initial settings */ if (init_termios && serial->type->init_termios) serial->type->init_termios(tty); tty->driver_data = port; return retval; err_put_module: module_put(serial->type->driver.owner); err_put_serial: usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); return retval; } static int serial_port_activate(struct tty_port *tport, struct tty_struct *tty) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial *serial = port->serial; int retval; mutex_lock(&serial->disc_mutex); if (serial->disconnected) { retval = -ENODEV; goto out_unlock; } retval = usb_autopm_get_interface(serial->interface); if (retval) goto out_unlock; retval = port->serial->type->open(tty, port); if (retval) usb_autopm_put_interface(serial->interface); out_unlock: mutex_unlock(&serial->disc_mutex); if (retval < 0) retval = usb_translate_errors(retval); return retval; } static int serial_open(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return tty_port_open(&port->port, tty, filp); } /** * serial_port_shutdown - shut down hardware * @tport: tty port to shut down * * Shut down a USB serial port. Serialized against activate by the * tport mutex and kept to matching open/close pairs * of calls by the tty-port initialized flag. * * Not called if tty is console. */ static void serial_port_shutdown(struct tty_port *tport) { struct usb_serial_port *port = container_of(tport, struct usb_serial_port, port); struct usb_serial_driver *drv = port->serial->type; if (drv->close) drv->close(port); usb_autopm_put_interface(port->serial->interface); } static void serial_hangup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_hangup(&port->port); } static void serial_close(struct tty_struct *tty, struct file *filp) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); tty_port_close(&port->port, tty, filp); } /** * serial_cleanup - free resources post close/hangup * @tty: tty to clean up * * Do the resource freeing and refcount dropping for the port. * Avoid freeing the console. * * Called asynchronously after the last tty kref is dropped. */ static void serial_cleanup(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial; struct module *owner; dev_dbg(&port->dev, "%s\n", __func__); /* The console is magical. Do not hang up the console hardware * or there will be tears. */ if (port->port.console) return; tty->driver_data = NULL; serial = port->serial; owner = serial->type->driver.owner; usb_serial_put(serial); module_put(owner); } static ssize_t serial_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct usb_serial_port *port = tty->driver_data; int retval = -ENODEV; if (port->serial->dev->state == USB_STATE_NOTATTACHED) goto exit; dev_dbg(&port->dev, "%s - %zu byte(s)\n", __func__, count); retval = port->serial->type->write(tty, port, buf, count); if (retval < 0) retval = usb_translate_errors(retval); exit: return retval; } static unsigned int serial_write_room(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); return port->serial->type->write_room(tty); } static unsigned int serial_chars_in_buffer(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (serial->disconnected) return 0; return serial->type->chars_in_buffer(tty); } static void serial_wait_until_sent(struct tty_struct *tty, int timeout) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; dev_dbg(&port->dev, "%s\n", __func__); if (!port->serial->type->wait_until_sent) return; mutex_lock(&serial->disc_mutex); if (!serial->disconnected) port->serial->type->wait_until_sent(tty, timeout); mutex_unlock(&serial->disc_mutex); } static void serial_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->throttle) port->serial->type->throttle(tty); } static void serial_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->unthrottle) port->serial->type->unthrottle(tty); } static int serial_get_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; mutex_lock(&tport->mutex); close_delay = jiffies_to_msecs(tport->close_delay) / 10; closing_wait = tport->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = jiffies_to_msecs(closing_wait) / 10; ss->line = port->minor; ss->close_delay = close_delay; ss->closing_wait = closing_wait; if (port->serial->type->get_serial) port->serial->type->get_serial(tty, ss); mutex_unlock(&tport->mutex); return 0; } static int serial_set_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct tty_port *tport = &port->port; unsigned int close_delay, closing_wait; int ret = 0; close_delay = msecs_to_jiffies(ss->close_delay * 10); closing_wait = ss->closing_wait; if (closing_wait != ASYNC_CLOSING_WAIT_NONE) closing_wait = msecs_to_jiffies(closing_wait * 10); mutex_lock(&tport->mutex); if (!capable(CAP_SYS_ADMIN)) { if (close_delay != tport->close_delay || closing_wait != tport->closing_wait) { ret = -EPERM; goto out_unlock; } } if (port->serial->type->set_serial) { ret = port->serial->type->set_serial(tty, ss); if (ret) goto out_unlock; } tport->close_delay = close_delay; tport->closing_wait = closing_wait; out_unlock: mutex_unlock(&tport->mutex); return ret; } static int serial_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct usb_serial_port *port = tty->driver_data; int retval = -ENOIOCTLCMD; dev_dbg(&port->dev, "%s - cmd 0x%04x\n", __func__, cmd); switch (cmd) { case TIOCMIWAIT: if (port->serial->type->tiocmiwait) retval = port->serial->type->tiocmiwait(tty, arg); break; default: if (port->serial->type->ioctl) retval = port->serial->type->ioctl(tty, cmd, arg); } return retval; } static void serial_set_termios(struct tty_struct *tty, const struct ktermios *old) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->set_termios) port->serial->type->set_termios(tty, port, old); else tty_termios_copy_hw(&tty->termios, old); } static int serial_break(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->break_ctl) return port->serial->type->break_ctl(tty, break_state); return -ENOTTY; } static int serial_proc_show(struct seq_file *m, void *v) { struct usb_serial *serial; struct usb_serial_port *port; int i; char tmp[40]; seq_puts(m, "usbserinfo:1.0 driver:2.0\n"); for (i = 0; i < USB_SERIAL_TTY_MINORS; ++i) { port = usb_serial_port_get_by_minor(i); if (port == NULL) continue; serial = port->serial; seq_printf(m, "%d:", i); if (serial->type->driver.owner) seq_printf(m, " module:%s", module_name(serial->type->driver.owner)); seq_printf(m, " name:\"%s\"", serial->type->description); seq_printf(m, " vendor:%04x product:%04x", le16_to_cpu(serial->dev->descriptor.idVendor), le16_to_cpu(serial->dev->descriptor.idProduct)); seq_printf(m, " num_ports:%d", serial->num_ports); seq_printf(m, " port:%d", port->port_number); usb_make_path(serial->dev, tmp, sizeof(tmp)); seq_printf(m, " path:%s", tmp); seq_putc(m, '\n'); usb_serial_put(serial); mutex_unlock(&serial->disc_mutex); } return 0; } static int serial_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmget) return port->serial->type->tiocmget(tty); return -ENOTTY; } static int serial_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->tiocmset) return port->serial->type->tiocmset(tty, set, clear); return -ENOTTY; } static int serial_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct usb_serial_port *port = tty->driver_data; dev_dbg(&port->dev, "%s\n", __func__); if (port->serial->type->get_icount) return port->serial->type->get_icount(tty, icount); return -ENOTTY; } /* * We would be calling tty_wakeup here, but unfortunately some line * disciplines have an annoying habit of calling tty->write from * the write wakeup callback (e.g. n_hdlc.c). */ void usb_serial_port_softint(struct usb_serial_port *port) { schedule_work(&port->work); } EXPORT_SYMBOL_GPL(usb_serial_port_softint); static void usb_serial_port_work(struct work_struct *work) { struct usb_serial_port *port = container_of(work, struct usb_serial_port, work); tty_port_tty_wakeup(&port->port); } static void usb_serial_port_poison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_poison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_poison_urb(port->write_urbs[i]); usb_poison_urb(port->interrupt_in_urb); usb_poison_urb(port->interrupt_out_urb); } static void usb_serial_port_unpoison_urbs(struct usb_serial_port *port) { int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_unpoison_urb(port->read_urbs[i]); for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) usb_unpoison_urb(port->write_urbs[i]); usb_unpoison_urb(port->interrupt_in_urb); usb_unpoison_urb(port->interrupt_out_urb); } static void usb_serial_port_release(struct device *dev) { struct usb_serial_port *port = to_usb_serial_port(dev); int i; dev_dbg(dev, "%s\n", __func__); usb_free_urb(port->interrupt_in_urb); usb_free_urb(port->interrupt_out_urb); for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { usb_free_urb(port->read_urbs[i]); kfree(port->bulk_in_buffers[i]); } for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { usb_free_urb(port->write_urbs[i]); kfree(port->bulk_out_buffers[i]); } kfifo_free(&port->write_fifo); kfree(port->interrupt_in_buffer); kfree(port->interrupt_out_buffer); tty_port_destroy(&port->port); kfree(port); } static struct usb_serial *create_serial(struct usb_device *dev, struct usb_interface *interface, struct usb_serial_driver *driver) { struct usb_serial *serial; serial = kzalloc(sizeof(*serial), GFP_KERNEL); if (!serial) return NULL; serial->dev = usb_get_dev(dev); serial->type = driver; serial->interface = usb_get_intf(interface); kref_init(&serial->kref); mutex_init(&serial->disc_mutex); serial->minors_reserved = 0; return serial; } static const struct usb_device_id *match_dynamic_id(struct usb_interface *intf, struct usb_serial_driver *drv) { struct usb_dynid *dynid; guard(mutex)(&usb_dynids_lock); list_for_each_entry(dynid, &drv->dynids.list, node) { if (usb_match_one_id(intf, &dynid->id)) { return &dynid->id; } } return NULL; } static const struct usb_device_id *get_iface_id(struct usb_serial_driver *drv, struct usb_interface *intf) { const struct usb_device_id *id; id = usb_match_id(intf, drv->id_table); if (id) { dev_dbg(&intf->dev, "static descriptor matches\n"); goto exit; } id = match_dynamic_id(intf, drv); if (id) dev_dbg(&intf->dev, "dynamic descriptor matches\n"); exit: return id; } /* Caller must hold table_lock */ static struct usb_serial_driver *search_serial_device( struct usb_interface *iface) { const struct usb_device_id *id = NULL; struct usb_serial_driver *drv; struct usb_driver *driver = to_usb_driver(iface->dev.driver); /* Check if the usb id matches a known device */ list_for_each_entry(drv, &usb_serial_driver_list, driver_list) { if (drv->usb_driver == driver) id = get_iface_id(drv, iface); if (id) return drv; } return NULL; } static bool serial_port_carrier_raised(struct tty_port *port) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->carrier_raised) return drv->carrier_raised(p); /* No carrier control - don't block */ return true; } static void serial_port_dtr_rts(struct tty_port *port, bool on) { struct usb_serial_port *p = container_of(port, struct usb_serial_port, port); struct usb_serial_driver *drv = p->serial->type; if (drv->dtr_rts) drv->dtr_rts(p, on); } static ssize_t port_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_serial_port *port = to_usb_serial_port(dev); return sprintf(buf, "%u\n", port->port_number); } static DEVICE_ATTR_RO(port_number); static struct attribute *usb_serial_port_attrs[] = { &dev_attr_port_number.attr, NULL }; ATTRIBUTE_GROUPS(usb_serial_port); static const struct tty_port_operations serial_port_ops = { .carrier_raised = serial_port_carrier_raised, .dtr_rts = serial_port_dtr_rts, .activate = serial_port_activate, .shutdown = serial_port_shutdown, }; static void store_endpoint(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_endpoint_descriptor *epd) { struct device *dev = &serial->interface->dev; u8 addr = epd->bEndpointAddress; if (usb_endpoint_is_bulk_in(epd)) { if (epds->num_bulk_in == ARRAY_SIZE(epds->bulk_in)) return; dev_dbg(dev, "found bulk in endpoint %02x\n", addr); epds->bulk_in[epds->num_bulk_in++] = epd; } else if (usb_endpoint_is_bulk_out(epd)) { if (epds->num_bulk_out == ARRAY_SIZE(epds->bulk_out)) return; dev_dbg(dev, "found bulk out endpoint %02x\n", addr); epds->bulk_out[epds->num_bulk_out++] = epd; } else if (usb_endpoint_is_int_in(epd)) { if (epds->num_interrupt_in == ARRAY_SIZE(epds->interrupt_in)) return; dev_dbg(dev, "found interrupt in endpoint %02x\n", addr); epds->interrupt_in[epds->num_interrupt_in++] = epd; } else if (usb_endpoint_is_int_out(epd)) { if (epds->num_interrupt_out == ARRAY_SIZE(epds->interrupt_out)) return; dev_dbg(dev, "found interrupt out endpoint %02x\n", addr); epds->interrupt_out[epds->num_interrupt_out++] = epd; } } static void find_endpoints(struct usb_serial *serial, struct usb_serial_endpoints *epds, struct usb_interface *intf) { struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *epd; unsigned int i; iface_desc = intf->cur_altsetting; for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { epd = &iface_desc->endpoint[i].desc; store_endpoint(serial, epds, epd); } } static int setup_port_bulk_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; buffer_size = max_t(int, type->bulk_in_size, usb_endpoint_maxp(epd)); port->bulk_in_size = buffer_size; port->bulk_in_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) { set_bit(i, &port->read_urbs_free); port->read_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->read_urbs[i]) return -ENOMEM; port->bulk_in_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_in_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->read_urbs[i], udev, usb_rcvbulkpipe(udev, epd->bEndpointAddress), port->bulk_in_buffers[i], buffer_size, type->read_bulk_callback, port); } port->read_urb = port->read_urbs[0]; port->bulk_in_buffer = port->bulk_in_buffers[0]; return 0; } static int setup_port_bulk_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; int i; if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) return -ENOMEM; if (type->bulk_out_size) buffer_size = type->bulk_out_size; else buffer_size = usb_endpoint_maxp(epd); port->bulk_out_size = buffer_size; port->bulk_out_endpointAddress = epd->bEndpointAddress; for (i = 0; i < ARRAY_SIZE(port->write_urbs); ++i) { set_bit(i, &port->write_urbs_free); port->write_urbs[i] = usb_alloc_urb(0, GFP_KERNEL); if (!port->write_urbs[i]) return -ENOMEM; port->bulk_out_buffers[i] = kmalloc(buffer_size, GFP_KERNEL); if (!port->bulk_out_buffers[i]) return -ENOMEM; usb_fill_bulk_urb(port->write_urbs[i], udev, usb_sndbulkpipe(udev, epd->bEndpointAddress), port->bulk_out_buffers[i], buffer_size, type->write_bulk_callback, port); } port->write_urb = port->write_urbs[0]; port->bulk_out_buffer = port->bulk_out_buffers[0]; return 0; } static int setup_port_interrupt_in(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_in_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_in_endpointAddress = epd->bEndpointAddress; port->interrupt_in_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_in_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_in_urb, udev, usb_rcvintpipe(udev, epd->bEndpointAddress), port->interrupt_in_buffer, buffer_size, type->read_int_callback, port, epd->bInterval); return 0; } static int setup_port_interrupt_out(struct usb_serial_port *port, struct usb_endpoint_descriptor *epd) { struct usb_serial_driver *type = port->serial->type; struct usb_device *udev = port->serial->dev; int buffer_size; port->interrupt_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_out_urb) return -ENOMEM; buffer_size = usb_endpoint_maxp(epd); port->interrupt_out_size = buffer_size; port->interrupt_out_endpointAddress = epd->bEndpointAddress; port->interrupt_out_buffer = kmalloc(buffer_size, GFP_KERNEL); if (!port->interrupt_out_buffer) return -ENOMEM; usb_fill_int_urb(port->interrupt_out_urb, udev, usb_sndintpipe(udev, epd->bEndpointAddress), port->interrupt_out_buffer, buffer_size, type->write_int_callback, port, epd->bInterval); return 0; } static int usb_serial_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct device *ddev = &interface->dev; struct usb_device *dev = interface_to_usbdev(interface); struct usb_serial *serial = NULL; struct usb_serial_port *port; struct usb_serial_endpoints *epds; struct usb_serial_driver *type = NULL; int retval; int i; int num_ports = 0; unsigned char max_endpoints; mutex_lock(&table_lock); type = search_serial_device(interface); if (!type) { mutex_unlock(&table_lock); dev_dbg(ddev, "none matched\n"); return -ENODEV; } if (!try_module_get(type->driver.owner)) { mutex_unlock(&table_lock); dev_err(ddev, "module get failed, exiting\n"); return -EIO; } mutex_unlock(&table_lock); serial = create_serial(dev, interface, type); if (!serial) { retval = -ENOMEM; goto err_put_module; } /* if this device type has a probe function, call it */ if (type->probe) { const struct usb_device_id *id; id = get_iface_id(type, interface); retval = type->probe(serial, id); if (retval) { dev_dbg(ddev, "sub driver rejected device\n"); goto err_release_sibling; } } /* descriptor matches, let's find the endpoints needed */ epds = kzalloc(sizeof(*epds), GFP_KERNEL); if (!epds) { retval = -ENOMEM; goto err_release_sibling; } find_endpoints(serial, epds, interface); if (serial->sibling) find_endpoints(serial, epds, serial->sibling); if (epds->num_bulk_in < type->num_bulk_in || epds->num_bulk_out < type->num_bulk_out || epds->num_interrupt_in < type->num_interrupt_in || epds->num_interrupt_out < type->num_interrupt_out) { dev_err(ddev, "required endpoints missing\n"); retval = -ENODEV; goto err_free_epds; } if (type->calc_num_ports) { retval = type->calc_num_ports(serial, epds); if (retval < 0) goto err_free_epds; num_ports = retval; } if (!num_ports) num_ports = type->num_ports; if (num_ports > MAX_NUM_PORTS) { dev_warn(ddev, "too many ports requested: %d\n", num_ports); num_ports = MAX_NUM_PORTS; } serial->num_ports = (unsigned char)num_ports; serial->num_bulk_in = epds->num_bulk_in; serial->num_bulk_out = epds->num_bulk_out; serial->num_interrupt_in = epds->num_interrupt_in; serial->num_interrupt_out = epds->num_interrupt_out; /* found all that we need */ dev_info(ddev, "%s converter detected\n", type->description); /* create our ports, we need as many as the max endpoints */ /* we don't use num_ports here because some devices have more endpoint pairs than ports */ max_endpoints = max(epds->num_bulk_in, epds->num_bulk_out); max_endpoints = max(max_endpoints, epds->num_interrupt_in); max_endpoints = max(max_endpoints, epds->num_interrupt_out); max_endpoints = max(max_endpoints, serial->num_ports); serial->num_port_pointers = max_endpoints; dev_dbg(ddev, "setting up %d port structure(s)\n", max_endpoints); for (i = 0; i < max_endpoints; ++i) { port = kzalloc(sizeof(struct usb_serial_port), GFP_KERNEL); if (!port) { retval = -ENOMEM; goto err_free_epds; } tty_port_init(&port->port); port->port.ops = &serial_port_ops; port->serial = serial; spin_lock_init(&port->lock); /* Keep this for private driver use for the moment but should probably go away */ INIT_WORK(&port->work, usb_serial_port_work); serial->port[i] = port; port->dev.parent = &interface->dev; port->dev.driver = NULL; port->dev.bus = &usb_serial_bus_type; port->dev.release = &usb_serial_port_release; port->dev.groups = usb_serial_port_groups; device_initialize(&port->dev); } /* set up the endpoint information */ for (i = 0; i < epds->num_bulk_in; ++i) { retval = setup_port_bulk_in(serial->port[i], epds->bulk_in[i]); if (retval) goto err_free_epds; } for (i = 0; i < epds->num_bulk_out; ++i) { retval = setup_port_bulk_out(serial->port[i], epds->bulk_out[i]); if (retval) goto err_free_epds; } if (serial->type->read_int_callback) { for (i = 0; i < epds->num_interrupt_in; ++i) { retval = setup_port_interrupt_in(serial->port[i], epds->interrupt_in[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_in) { dev_dbg(ddev, "The device claims to support interrupt in transfers, but read_int_callback is not defined\n"); } if (serial->type->write_int_callback) { for (i = 0; i < epds->num_interrupt_out; ++i) { retval = setup_port_interrupt_out(serial->port[i], epds->interrupt_out[i]); if (retval) goto err_free_epds; } } else if (epds->num_interrupt_out) { dev_dbg(ddev, "The device claims to support interrupt out transfers, but write_int_callback is not defined\n"); } usb_set_intfdata(interface, serial); /* if this device type has an attach function, call it */ if (type->attach) { retval = type->attach(serial); if (retval < 0) goto err_free_epds; serial->attached = 1; if (retval > 0) { /* quietly accept this device, but don't bind to a serial port as it's about to disappear */ serial->num_ports = 0; goto exit; } } else { serial->attached = 1; } retval = allocate_minors(serial, num_ports); if (retval) { dev_err(ddev, "No more free serial minor numbers\n"); goto err_free_epds; } /* register all of the individual ports with the driver core */ for (i = 0; i < num_ports; ++i) { port = serial->port[i]; dev_set_name(&port->dev, "ttyUSB%d", port->minor); dev_dbg(ddev, "registering %s\n", dev_name(&port->dev)); device_enable_async_suspend(&port->dev); retval = device_add(&port->dev); if (retval) dev_err(ddev, "Error registering port device, continuing\n"); } if (num_ports > 0) usb_serial_console_init(serial->port[0]->minor); exit: kfree(epds); module_put(type->driver.owner); return 0; err_free_epds: kfree(epds); err_release_sibling: release_sibling(serial, interface); usb_serial_put(serial); err_put_module: module_put(type->driver.owner); return retval; } static void usb_serial_disconnect(struct usb_interface *interface) { int i; struct usb_serial *serial = usb_get_intfdata(interface); struct device *dev = &interface->dev; struct usb_serial_port *port; /* sibling interface is cleaning up */ if (!serial) return; usb_serial_console_disconnect(serial); mutex_lock(&serial->disc_mutex); /* must set a flag, to signal subdrivers */ serial->disconnected = 1; mutex_unlock(&serial->disc_mutex); for (i = 0; i < serial->num_ports; ++i) { port = serial->port[i]; tty_port_tty_vhangup(&port->port); usb_serial_port_poison_urbs(port); wake_up_interruptible(&port->port.delta_msr_wait); cancel_work_sync(&port->work); if (device_is_registered(&port->dev)) device_del(&port->dev); } if (serial->type->disconnect) serial->type->disconnect(serial); release_sibling(serial, interface); /* let the last holder of this object cause it to be cleaned up */ usb_serial_put(serial); dev_info(dev, "device disconnected\n"); } int usb_serial_suspend(struct usb_interface *intf, pm_message_t message) { struct usb_serial *serial = usb_get_intfdata(intf); int i, r; /* suspend when called for first sibling interface */ if (serial->suspend_count++) return 0; /* * serial->type->suspend() MUST return 0 in system sleep context, * otherwise, the resume callback has to recover device from * previous suspend failure. */ if (serial->type->suspend) { r = serial->type->suspend(serial, message); if (r < 0) { serial->suspend_count--; return r; } } for (i = 0; i < serial->num_ports; ++i) usb_serial_port_poison_urbs(serial->port[i]); return 0; } EXPORT_SYMBOL(usb_serial_suspend); static void usb_serial_unpoison_port_urbs(struct usb_serial *serial) { int i; for (i = 0; i < serial->num_ports; ++i) usb_serial_port_unpoison_urbs(serial->port[i]); } int usb_serial_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->resume) rv = serial->type->resume(serial); else rv = usb_serial_generic_resume(serial); return rv; } EXPORT_SYMBOL(usb_serial_resume); static int usb_serial_reset_resume(struct usb_interface *intf) { struct usb_serial *serial = usb_get_intfdata(intf); int rv; /* resume when called for last sibling interface */ if (--serial->suspend_count) return 0; usb_serial_unpoison_port_urbs(serial); if (serial->type->reset_resume) { rv = serial->type->reset_resume(serial); } else { rv = -EOPNOTSUPP; intf->needs_binding = 1; } return rv; } static const struct tty_operations serial_ops = { .open = serial_open, .close = serial_close, .write = serial_write, .hangup = serial_hangup, .write_room = serial_write_room, .ioctl = serial_ioctl, .set_termios = serial_set_termios, .throttle = serial_throttle, .unthrottle = serial_unthrottle, .break_ctl = serial_break, .chars_in_buffer = serial_chars_in_buffer, .wait_until_sent = serial_wait_until_sent, .tiocmget = serial_tiocmget, .tiocmset = serial_tiocmset, .get_icount = serial_get_icount, .set_serial = serial_set_serial, .get_serial = serial_get_serial, .cleanup = serial_cleanup, .install = serial_install, .proc_show = serial_proc_show, }; struct tty_driver *usb_serial_tty_driver; static int __init usb_serial_init(void) { int result; usb_serial_tty_driver = tty_alloc_driver(USB_SERIAL_TTY_MINORS, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(usb_serial_tty_driver)) return PTR_ERR(usb_serial_tty_driver); /* Initialize our global data */ result = bus_register(&usb_serial_bus_type); if (result) { pr_err("%s - registering bus driver failed\n", __func__); goto err_put_driver; } usb_serial_tty_driver->driver_name = "usbserial"; usb_serial_tty_driver->name = "ttyUSB"; usb_serial_tty_driver->major = USB_SERIAL_TTY_MAJOR; usb_serial_tty_driver->minor_start = 0; usb_serial_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; usb_serial_tty_driver->subtype = SERIAL_TYPE_NORMAL; usb_serial_tty_driver->init_termios = tty_std_termios; usb_serial_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; usb_serial_tty_driver->init_termios.c_ispeed = 9600; usb_serial_tty_driver->init_termios.c_ospeed = 9600; tty_set_operations(usb_serial_tty_driver, &serial_ops); result = tty_register_driver(usb_serial_tty_driver); if (result) { pr_err("%s - tty_register_driver failed\n", __func__); goto err_unregister_bus; } /* register the generic driver, if we should */ result = usb_serial_generic_register(); if (result < 0) { pr_err("%s - registering generic driver failed\n", __func__); goto err_unregister_driver; } return result; err_unregister_driver: tty_unregister_driver(usb_serial_tty_driver); err_unregister_bus: bus_unregister(&usb_serial_bus_type); err_put_driver: pr_err("%s - returning with error %d\n", __func__, result); tty_driver_kref_put(usb_serial_tty_driver); return result; } static void __exit usb_serial_exit(void) { usb_serial_console_exit(); usb_serial_generic_deregister(); tty_unregister_driver(usb_serial_tty_driver); tty_driver_kref_put(usb_serial_tty_driver); bus_unregister(&usb_serial_bus_type); idr_destroy(&serial_minors); } module_init(usb_serial_init); module_exit(usb_serial_exit); #define set_to_generic_if_null(type, function) \ do { \ if (!type->function) { \ type->function = usb_serial_generic_##function; \ pr_debug("%s: using generic " #function "\n", \ type->driver.name); \ } \ } while (0) static void usb_serial_operations_init(struct usb_serial_driver *device) { set_to_generic_if_null(device, open); set_to_generic_if_null(device, write); set_to_generic_if_null(device, close); set_to_generic_if_null(device, write_room); set_to_generic_if_null(device, chars_in_buffer); if (device->tx_empty) set_to_generic_if_null(device, wait_until_sent); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); set_to_generic_if_null(device, process_read_urb); set_to_generic_if_null(device, prepare_write_buffer); } static int usb_serial_register(struct usb_serial_driver *driver) { int retval; if (usb_disabled()) return -ENODEV; if (!driver->description) driver->description = driver->driver.name; if (!driver->usb_driver) { WARN(1, "Serial driver %s has no usb_driver\n", driver->description); return -EINVAL; } /* Prevent individual ports from being unbound. */ driver->driver.suppress_bind_attrs = true; usb_serial_operations_init(driver); /* Add this device to our list of devices */ mutex_lock(&table_lock); list_add(&driver->driver_list, &usb_serial_driver_list); retval = usb_serial_bus_register(driver); if (retval) { pr_err("problem %d when registering driver %s\n", retval, driver->description); list_del(&driver->driver_list); } else { pr_info("USB Serial support registered for %s\n", driver->description); } mutex_unlock(&table_lock); return retval; } static void usb_serial_deregister(struct usb_serial_driver *device) { pr_info("USB Serial deregistering driver %s\n", device->description); mutex_lock(&table_lock); list_del(&device->driver_list); mutex_unlock(&table_lock); usb_serial_bus_deregister(device); } /** * __usb_serial_register_drivers - register drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be registered * @owner: owning module * @name: name of the usb_driver for this set of @serial_drivers * @id_table: list of all devices this @serial_drivers set binds to * * Registers all the drivers in the @serial_drivers array, and dynamically * creates a struct usb_driver with the name @name and id_table of @id_table. */ int __usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], struct module *owner, const char *name, const struct usb_device_id *id_table) { int rc; struct usb_driver *udriver; struct usb_serial_driver * const *sd; /* * udriver must be registered before any of the serial drivers, * because the store_new_id() routine for the serial drivers (in * bus.c) probes udriver. * * Performance hack: We don't want udriver to be probed until * the serial drivers are registered, because the probe would * simply fail for lack of a matching serial driver. * So we leave udriver's id_table set to NULL until we are all set. * * Suspend/resume support is implemented in the usb-serial core, * so fill in the PM-related fields in udriver. */ udriver = kzalloc(sizeof(*udriver), GFP_KERNEL); if (!udriver) return -ENOMEM; udriver->name = name; udriver->no_dynamic_id = 1; udriver->supports_autosuspend = 1; udriver->suspend = usb_serial_suspend; udriver->resume = usb_serial_resume; udriver->probe = usb_serial_probe; udriver->disconnect = usb_serial_disconnect; /* we only set the reset_resume field if the serial_driver has one */ for (sd = serial_drivers; *sd; ++sd) { if ((*sd)->reset_resume) { udriver->reset_resume = usb_serial_reset_resume; break; } } rc = usb_register(udriver); if (rc) goto err_free_driver; for (sd = serial_drivers; *sd; ++sd) { (*sd)->usb_driver = udriver; (*sd)->driver.owner = owner; rc = usb_serial_register(*sd); if (rc) goto err_deregister_drivers; } /* Now set udriver's id_table and look for matches */ udriver->id_table = id_table; rc = driver_attach(&udriver->driver); return 0; err_deregister_drivers: while (sd-- > serial_drivers) usb_serial_deregister(*sd); usb_deregister(udriver); err_free_driver: kfree(udriver); return rc; } EXPORT_SYMBOL_GPL(__usb_serial_register_drivers); /** * usb_serial_deregister_drivers - deregister drivers for a usb-serial module * @serial_drivers: NULL-terminated array of pointers to drivers to be deregistered * * Deregisters all the drivers in the @serial_drivers array and deregisters and * frees the struct usb_driver that was created by the call to * usb_serial_register_drivers(). */ void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]) { struct usb_driver *udriver = (*serial_drivers)->usb_driver; for (; *serial_drivers; ++serial_drivers) usb_serial_deregister(*serial_drivers); usb_deregister(udriver); kfree(udriver); } EXPORT_SYMBOL_GPL(usb_serial_deregister_drivers); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2");
1 5 5 4 4 3 3 3 3 1 3 1 2 2 1 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * Copyright (c) 2012 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/string.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_tables.h> #include <net/ip.h> struct nft_nat { u8 sreg_addr_min; u8 sreg_addr_max; u8 sreg_proto_min; u8 sreg_proto_max; enum nf_nat_manip_type type:8; u8 family; u16 flags; }; static void nft_nat_setup_addr(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { switch (priv->family) { case AF_INET: range->min_addr.ip = (__force __be32) regs->data[priv->sreg_addr_min]; range->max_addr.ip = (__force __be32) regs->data[priv->sreg_addr_max]; break; case AF_INET6: memcpy(range->min_addr.ip6, &regs->data[priv->sreg_addr_min], sizeof(range->min_addr.ip6)); memcpy(range->max_addr.ip6, &regs->data[priv->sreg_addr_max], sizeof(range->max_addr.ip6)); break; } } static void nft_nat_setup_proto(struct nf_nat_range2 *range, const struct nft_regs *regs, const struct nft_nat *priv) { range->min_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_min]); range->max_proto.all = (__force __be16) nft_reg_load16(&regs->data[priv->sreg_proto_max]); } static void nft_nat_setup_netmap(struct nf_nat_range2 *range, const struct nft_pktinfo *pkt, const struct nft_nat *priv) { struct sk_buff *skb = pkt->skb; union nf_inet_addr new_addr; __be32 netmask; int i, len = 0; switch (priv->type) { case NFT_NAT_SNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->saddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->saddr; len = sizeof(struct in6_addr); } break; case NFT_NAT_DNAT: if (nft_pf(pkt) == NFPROTO_IPV4) { new_addr.ip = ip_hdr(skb)->daddr; len = sizeof(struct in_addr); } else { new_addr.in6 = ipv6_hdr(skb)->daddr; len = sizeof(struct in6_addr); } break; } for (i = 0; i < len / sizeof(__be32); i++) { netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); new_addr.ip6[i] &= ~netmask; new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; } range->min_addr = new_addr; range->max_addr = new_addr; } static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { nft_nat_setup_addr(&range, regs, priv); if (priv->flags & NF_NAT_RANGE_NETMAP) nft_nat_setup_netmap(&range, pkt, priv); } if (priv->sreg_proto_min) nft_nat_setup_proto(&range, regs, priv); range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_TYPE] = { .type = NLA_U32 }, [NFTA_NAT_FAMILY] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, [NFTA_NAT_FLAGS] = NLA_POLICY_MASK(NLA_BE32, NF_NAT_RANGE_MASK), }; static int nft_nat_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_nat *priv = nft_expr_priv(expr); int err; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; switch (priv->type) { case NFT_NAT_SNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN)); break; case NFT_NAT_DNAT: err = nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)); break; } return err; } static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_nat *priv = nft_expr_priv(expr); unsigned int alen, plen; u32 family; int err; if (tb[NFTA_NAT_TYPE] == NULL || (tb[NFTA_NAT_REG_ADDR_MIN] == NULL && tb[NFTA_NAT_REG_PROTO_MIN] == NULL)) return -EINVAL; switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) { case NFT_NAT_SNAT: priv->type = NF_NAT_MANIP_SRC; break; case NFT_NAT_DNAT: priv->type = NF_NAT_MANIP_DST; break; default: return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY])); if (ctx->family != NFPROTO_INET && ctx->family != family) return -EOPNOTSUPP; switch (family) { case NFPROTO_IPV4: alen = sizeof_field(struct nf_nat_range, min_addr.ip); break; case NFPROTO_IPV6: alen = sizeof_field(struct nf_nat_range, min_addr.ip6); break; default: if (tb[NFTA_NAT_REG_ADDR_MIN]) return -EAFNOSUPPORT; break; } priv->family = family; if (tb[NFTA_NAT_REG_ADDR_MIN]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN], &priv->sreg_addr_min, alen); if (err < 0) return err; if (tb[NFTA_NAT_REG_ADDR_MAX]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX], &priv->sreg_addr_max, alen); if (err < 0) return err; } else { priv->sreg_addr_max = priv->sreg_addr_min; } priv->flags |= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_proto.all); if (tb[NFTA_NAT_REG_PROTO_MIN]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN], &priv->sreg_proto_min, plen); if (err < 0) return err; if (tb[NFTA_NAT_REG_PROTO_MAX]) { err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX], &priv->sreg_proto_max, plen); if (err < 0) return err; } else { priv->sreg_proto_max = priv->sreg_proto_min; } priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); return nf_ct_netns_get(ctx->net, family); } static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); switch (priv->type) { case NF_NAT_MANIP_SRC: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT))) goto nla_put_failure; break; case NF_NAT_MANIP_DST: if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT))) goto nla_put_failure; break; } if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family))) goto nla_put_failure; if (priv->sreg_addr_min) { if (nft_dump_register(skb, NFTA_NAT_REG_ADDR_MIN, priv->sreg_addr_min) || nft_dump_register(skb, NFTA_NAT_REG_ADDR_MAX, priv->sreg_addr_max)) goto nla_put_failure; } if (priv->sreg_proto_min) { if (nft_dump_register(skb, NFTA_NAT_REG_PROTO_MIN, priv->sreg_proto_min) || nft_dump_register(skb, NFTA_NAT_REG_PROTO_MAX, priv->sreg_proto_max)) goto nla_put_failure; } if (priv->flags != 0) { if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static void nft_nat_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_nat *priv = nft_expr_priv(expr); nf_ct_netns_put(ctx->net, priv->family); } static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_nat_type __read_mostly = { .name = "nat", .ops = &nft_nat_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_TABLES_INET static void nft_nat_inet_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_nat *priv = nft_expr_priv(expr); if (priv->family == nft_pf(pkt) || priv->family == NFPROTO_INET) nft_nat_eval(expr, regs, pkt); } static const struct nft_expr_ops nft_nat_inet_ops = { .type = &nft_nat_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_nat)), .eval = nft_nat_inet_eval, .init = nft_nat_init, .destroy = nft_nat_destroy, .dump = nft_nat_dump, .validate = nft_nat_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_inet_nat_type __read_mostly = { .name = "nat", .family = NFPROTO_INET, .ops = &nft_nat_inet_ops, .policy = nft_nat_policy, .maxattr = NFTA_NAT_MAX, .owner = THIS_MODULE, }; static int nft_nat_inet_module_init(void) { return nft_register_expr(&nft_inet_nat_type); } static void nft_nat_inet_module_exit(void) { nft_unregister_expr(&nft_inet_nat_type); } #else static int nft_nat_inet_module_init(void) { return 0; } static void nft_nat_inet_module_exit(void) { } #endif static int __init nft_nat_module_init(void) { int ret = nft_nat_inet_module_init(); if (ret) return ret; ret = nft_register_expr(&nft_nat_type); if (ret) nft_nat_inet_module_exit(); return ret; } static void __exit nft_nat_module_exit(void) { nft_nat_inet_module_exit(); nft_unregister_expr(&nft_nat_type); } module_init(nft_nat_module_init); module_exit(nft_nat_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); MODULE_ALIAS_NFT_EXPR("nat"); MODULE_DESCRIPTION("Network Address Translation support");
4 4 4 4 3 4 5 5 4 2 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct debug_req_info { struct ethnl_req_info base; }; struct debug_reply_data { struct ethnl_reply_data base; u32 msg_mask; }; #define DEBUG_REPDATA(__reply_base) \ container_of(__reply_base, struct debug_reply_data, base) const struct nla_policy ethnl_debug_get_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int debug_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct debug_reply_data *data = DEBUG_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_msglevel) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->msg_mask = dev->ethtool_ops->get_msglevel(dev); ethnl_ops_complete(dev); return 0; } static int debug_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } static int debug_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct debug_reply_data *data = DEBUG_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT, netif_msg_class_names, compact); } /* DEBUG_SET */ const struct nla_policy ethnl_debug_set_policy[] = { [ETHTOOL_A_DEBUG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED }, }; static int ethnl_set_debug_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_msglevel && ops->set_msglevel ? 1 : -EOPNOTSUPP; } static int ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; u32 msg_mask; int ret; msg_mask = dev->ethtool_ops->get_msglevel(dev); ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT, tb[ETHTOOL_A_DEBUG_MSGMASK], netif_msg_class_names, info->extack, &mod); if (ret < 0 || !mod) return ret; dev->ethtool_ops->set_msglevel(dev, msg_mask); return 1; } const struct ethnl_request_ops ethnl_debug_request_ops = { .request_cmd = ETHTOOL_MSG_DEBUG_GET, .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY, .hdr_attr = ETHTOOL_A_DEBUG_HEADER, .req_info_size = sizeof(struct debug_req_info), .reply_data_size = sizeof(struct debug_reply_data), .prepare_data = debug_prepare_data, .reply_size = debug_reply_size, .fill_reply = debug_fill_reply, .set_validate = ethnl_set_debug_validate, .set = ethnl_set_debug, .set_ntf_cmd = ETHTOOL_MSG_DEBUG_NTF, };
15 15 14 14 13 14 4 14 14 15 1 1 1 1 1 1 1 1 1 1 22 1 1 22 9 8 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 // SPDX-License-Identifier: GPL-2.0 /* * SUCS NET3: * * Generic stream handling routines. These are generic for most * protocols. Even IP. Tonight 8-). * This is used because TCP, LLC (others too) layer all have mostly * identical sendmsg() and recvmsg() code. * So we (will) share it here. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * (from old tcp.c code) * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-)) */ #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/net.h> #include <linux/signal.h> #include <linux/tcp.h> #include <linux/wait.h> #include <net/sock.h> /** * sk_stream_write_space - stream socket write_space callback. * @sk: pointer to the socket structure * * This function is invoked when there's space available in the socket's * send buffer for writing. It first checks if the socket is writable, * clears the SOCK_NOSPACE flag indicating that memory for writing * is now available, wakes up any processes waiting for write operations * and sends asynchronous notifications if needed. */ void sk_stream_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; struct socket_wq *wq; if (__sk_stream_is_writeable(sk, 1) && sock) { clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } /** * sk_stream_wait_connect - Wait for a socket to get into the connected state * @sk: sock to wait on * @timeo_p: for how long to wait * * Must be called with the socket locked. */ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct task_struct *tsk = current; int done; do { int err = sock_error(sk); if (err) return err; if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) return -EAGAIN; if (signal_pending(tsk)) return sock_intr_errno(*timeo_p); add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, !READ_ONCE(sk->sk_err) && !((1 << READ_ONCE(sk->sk_state)) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; } while (!done); return done < 0 ? done : 0; } EXPORT_SYMBOL(sk_stream_wait_connect); /** * sk_stream_closing - Return 1 if we still have things to send in our buffers. * @sk: socket to verify */ static int sk_stream_closing(const struct sock *sk) { return (1 << READ_ONCE(sk->sk_state)) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); } void sk_stream_wait_close(struct sock *sk, long timeout) { if (timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); do { if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk), &wait)) break; } while (!signal_pending(current) && timeout); remove_wait_queue(sk_sleep(sk), &wait); } } EXPORT_SYMBOL(sk_stream_wait_close); /** * sk_stream_wait_memory - Wait for more memory for a socket * @sk: socket to wait for memory * @timeo_p: for how long */ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) { int ret, err = 0; long vm_wait = 0; long current_timeo = *timeo_p; DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) current_timeo = vm_wait = get_random_u32_below(HZ / 5) + 2; add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; if (!*timeo_p) goto do_eagain; if (signal_pending(current)) goto do_interrupted; sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk_stream_memory_free(sk) && !vm_wait) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; ret = sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) || (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && !vm_wait), &wait); sk->sk_write_pending--; if (ret < 0) goto do_error; if (vm_wait) { vm_wait -= current_timeo; current_timeo = *timeo_p; if (current_timeo != MAX_SCHEDULE_TIMEOUT && (current_timeo -= vm_wait) < 0) current_timeo = 0; vm_wait = 0; } *timeo_p = current_timeo; } out: if (!sock_flag(sk, SOCK_DEAD)) remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: err = -EPIPE; goto out; do_eagain: /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can * be generated later. * When TCP receives ACK packets that make room, tcp_check_space() * only calls tcp_new_space() if SOCK_NOSPACE is set. */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); err = -EAGAIN; goto out; do_interrupted: err = sock_intr_errno(*timeo_p); goto out; } EXPORT_SYMBOL(sk_stream_wait_memory); int sk_stream_error(struct sock *sk, int flags, int err) { if (err == -EPIPE) err = sock_error(sk) ? : -EPIPE; if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); return err; } EXPORT_SYMBOL(sk_stream_error); void sk_stream_kill_queues(struct sock *sk) { /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); /* Next, the error queue. * We need to use queue lock, because other threads might * add packets to the queue without socket lock being held. */ skb_queue_purge(&sk->sk_error_queue); /* Next, the write queue. */ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue)); /* Account for returned memory. */ sk_mem_reclaim_final(sk); WARN_ON_ONCE(sk->sk_wmem_queued); /* It is _impossible_ for the backlog to contain anything * when we get here. All user references to this socket * have gone away, only the net layer knows can touch it. */ } EXPORT_SYMBOL(sk_stream_kill_queues);
3 2 297 297 297 1 3 4 3 293 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"}, \ {I_LINKABLE, "I_LINKABLE"}, \ {I_WB_SWITCH, "I_WB_SWITCH"}, \ {I_OVL_INUSE, "I_OVL_INUSE"}, \ {I_CREATING, "I_CREATING"}, \ {I_DONTCACHE, "I_DONTCACHE"}, \ {I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \ {I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \ {I_LRU_ISOLATING, "I_LRU_ISOLATING"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_folio_template, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs_queue, TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb, unsigned int count), TP_ARGS(old_wb, new_wb, count), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) __field(unsigned int, count) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); __entry->count = count; ), TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u", __entry->name, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino, __entry->count ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct folio *folio, struct bdi_writeback *wb), TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d bgrd=%d " "cyclic=%d start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, struct dirty_throttle_control *dtc, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, dtc, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, wb_setpoint) __field(unsigned long, wb_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = dtc->limit; __entry->setpoint = (dtc->limit + freerun) / 2; __entry->dirty = dtc->dirty; __entry->wb_setpoint = __entry->setpoint * dtc->wb_thresh / (dtc->thresh + 1); __entry->wb_dirty = dtc->wb_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "wb_setpoint=%lu wb_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->wb_setpoint, __entry->wb_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode_state_read_once(inode); __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-add_1.c - MPI helper functions * Copyright (C) 1994, 1996, 1997, 1998, * 2000 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" #include "longlong.h" mpi_limb_t mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_ptr_t s2_ptr, mpi_size_t size) { mpi_limb_t x, y, cy; mpi_size_t j; /* The loop counter and index J goes from -SIZE to -1. This way the loop becomes faster. */ j = -size; /* Offset the base pointers to compensate for the negative indices. */ s1_ptr -= j; s2_ptr -= j; res_ptr -= j; cy = 0; do { y = s2_ptr[j]; x = s1_ptr[j]; y += cy; /* add previous carry to one addend */ cy = y < cy; /* get out carry from that addition */ y += x; /* add other addend */ cy += y < x; /* get out carry from that add, combine */ res_ptr[j] = y; } while (++j); return cy; }
1109 1109 1115 162 388 16 16 16 16 12 4 4 16 16 16 4 16 388 388 5 5 381 162 232 332 333 329 25 333 331 331 221 221 221 6 219 219 1 1 1 1 1 1 1 1 1 1 1 1 1 92 91 92 91 92 92 90 388 90 5 100 99 47 389 125 385 1 1 386 387 327 60 389 329 10 385 100 387 101 382 388 389 69 389 389 389 450 273 273 213 272 232 219 61 60 1732 1729 1736 1740 1586 389 389 31 31 388 1 13 1773 1735 1743 1745 1590 1737 386 13 13 13 13 1 1 1 1 52 52 52 10 9 10 10 260 2 1 1 3 1 1 1 1 1 1 1 1 1 1 90 1 1 1 1 1 89 10 10 10 10 10 16 16 15 16 2 2 2 2 16 16 16 16 16 16 55 48 48 55 45 55 54 55 55 15 2 2 1 1 2 1 11 5 1 5 14 16 1 16 1 11 5 5 5 5 5 5 5 5 5 11 12 12 12 11 11 11 12 2247 2252 193 2201 12 12 2196 2248 1744 2 2259 2248 439 436 441 440 140 1474 1471 138 1467 14 5 314 12 315 314 315 315 310 315 2 312 310 310 310 310 309 5 5 2201 2203 407 408 406 2123 1403 1362 2082 2213 2002 2001 2010 2010 1580 1583 1578 1587 54 54 54 54 2 2 2 2 2 2 2 2 2 2 2 2 2 2 168 166 168 20 12 12 1 12 1162 12 1166 1109 1100 1105 1097 1164 817 810 795 808 883 892 892 232 166 167 167 892 3 3 1 3 1 1 2 2 3 828 827 832 829 828 1 403 402 402 402 402 405 406 407 403 407 406 407 406 407 407 406 407 407 407 407 406 407 407 406 406 406 404 405 406 9 820 401 822 310 821 333 332 333 493 402 818 332 818 487 488 456 453 455 14 455 2 2 2 2 1 2 2 2 2 2 1 1 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 2 1 2 2 2 2 2 11 11 335 335 336 331 259 336 337 27 28 28 9 9 9 9 9 28 2 2 2 2 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" #include <asm/runtime-const.h> /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_chilren * - childrens' d_sib and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... * dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ static int sysctl_vfs_cache_pressure __read_mostly = 100; static int sysctl_vfs_cache_pressure_denom __read_mostly = 100; unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom); } EXPORT_SYMBOL_GPL(vfs_pressure_ratio); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *__dentry_cache __ro_after_init; #define dentry_cache runtime_const_ptr(__dentry_cache) const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); const struct qstr dotdot_name = QSTR_INIT("..", 2); EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. * * Marking the variables "used" ensures that the compiler doesn't * optimize them away completely on architectures with runtime * constant infrastructure, this allows debuggers to see their * values. But updating these values has no effect on those arches. */ static unsigned int d_hash_shift __ro_after_init __used; static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { return runtime_const_ptr(dentry_hashtable) + runtime_const_shift_right_32(hashlen, d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ static struct dentry_stat_t dentry_stat = { .age_limit = 45, }; /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, { .procname = "dentry-negative", .data = &dentry_negative_policy, .maxlen = sizeof(dentry_negative_policy), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static const struct ctl_table vm_dcache_sysctls[] = { { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, { .procname = "vfs_cache_pressure_denom", .data = &sysctl_vfs_cache_pressure_denom, .maxlen = sizeof(sysctl_vfs_cache_pressure_denom), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE_HUNDRED, }, }; static int __init init_fs_dcache_sysctls(void) { register_sysctl_init("vm", vm_dcache_sysctls); register_sysctl_init("fs", fs_dcache_sysctls); return 0; } fs_initcall(init_fs_dcache_sysctls); #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } /* * long names are allocated separately from dentry and never modified. * Refcounted, freeing is RCU-delayed. See take_dentry_name_snapshot() * for the reason why ->count and ->head can't be combined into a union. * dentry_string_cmp() relies upon ->name[] being word-aligned. */ struct external_name { atomic_t count; struct rcu_head head; unsigned char name[] __aligned(sizeof(unsigned long)); }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_shortname.string; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { unsigned seq; const unsigned char *s; rcu_read_lock(); retry: seq = read_seqcount_begin(&dentry->d_seq); s = READ_ONCE(dentry->d_name.name); name->name.hash_len = dentry->d_name.hash_len; name->name.name = name->inline_name.string; if (likely(s == dentry->d_shortname.string)) { name->inline_name = dentry->d_shortname; } else { struct external_name *p; p = container_of(s, struct external_name, name[0]); // get a valid reference if (unlikely(!atomic_inc_not_zero(&p->count))) goto retry; name->name.name = s; } if (read_seqcount_retry(&dentry->d_seq, seq)) { release_dentry_name_snapshot(name); goto retry; } rcu_read_unlock(); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name.string)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->count))) kfree_rcu(p, head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; /* * The negative counter only tracks dentries on the LRU. Don't inc if * d_lru is on another list. */ if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del_obj( &dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_sib.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_sib.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_sib.next * points to something that won't be moving around. I.e. skip the * cursors. */ while (dentry->d_sib.next) { next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_sib.next = next->d_sib.next; } } static struct dentry *__dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); cond_resched(); /* now that it's negative, ->d_parent is stable */ if (!IS_ROOT(dentry)) { parent = dentry->d_parent; spin_lock(&parent->d_lock); } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry_unlist(dentry); if (dentry->d_flags & DCACHE_SHRINK_LIST) can_free = false; spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); if (parent && --parent->d_lockref.count) { spin_unlock(&parent->d_lock); return NULL; } return parent; } /* * Lock a dentry for feeding it to __dentry_kill(). * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry is busy. Otherwise, return true and have * that dentry's inode locked. */ static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; if (unlikely(dentry->d_lockref.count)) return false; if (!inode || likely(spin_trylock(&inode->i_lock))) return true; do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); if (likely(!dentry->d_lockref.count)) return true; if (inode) spin_unlock(&inode->i_lock); return false; } /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell * without a lock. False in this case means "punt to locked path and recheck". * * In case we aren't locked, these predicates are not "stable". However, it is * sufficient that at some point after we dropped the reference the dentry was * hashed and the flags had the proper value. Other dentry users may have * re-gotten a reference to the dentry and change that, but our work is done - * we can leave the dentry around with a zero refcount. */ static inline bool retain_dentry(struct dentry *dentry, bool locked) { unsigned int d_flags; smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); // Unreachable? Nobody would be able to look it up, no point retaining if (unlikely(d_unhashed(dentry))) return false; // Same if it's disconnected if (unlikely(d_flags & DCACHE_DISCONNECTED)) return false; // ->d_delete() might tell us not to bother, but that requires // ->d_lock; can't decide without it if (unlikely(d_flags & DCACHE_OP_DELETE)) { if (!locked || dentry->d_op->d_delete(dentry)) return false; } // Explicitly told not to bother if (unlikely(d_flags & DCACHE_DONTCACHE)) return false; // At this point it looks like we ought to keep it. We also might // need to do something - put it on LRU if it wasn't there already // and mark it referenced if it was on LRU, but not marked yet. // Unfortunately, both actions require ->d_lock, so in lockless // case we'd have to punt rather than doing those. if (unlikely(!(d_flags & DCACHE_LRU_LIST))) { if (!locked) return false; d_lru_add(dentry); } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) { if (!locked) return false; dentry->d_flags |= DCACHE_REFERENCED; } return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode_state_set(inode, I_DONTCACHE); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* * try to decrement the lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; } dentry->d_lockref.count--; goto locked; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Can we decide that decrement of refcount is all we needed without * taking the lock? There's a very common case when it's all we need - * dentry looks like it ought to be retained and there's nothing else * to do. */ if (retain_dentry(dentry, false)) return true; /* * Either not worth retaining or we can't tell without the lock. * Get the lock, then. We've already decremented the refcount to 0, * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ locked: if (dentry->d_lockref.count || retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return true; } return false; } static void finish_dput(struct dentry *dentry) __releases(dentry->d_lock) __releases(RCU) { while (lock_for_kill(dentry)) { rcu_read_unlock(); dentry = __dentry_kill(dentry); if (!dentry) return; if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } rcu_read_unlock(); spin_unlock(&dentry->d_lock); } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { if (!dentry) return; might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } finish_dput(dentry); } EXPORT_SYMBOL(dput); void d_make_discardable(struct dentry *dentry) { spin_lock(&dentry->d_lock); WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT)); dentry->d_flags &= ~DCACHE_PERSISTENT; dentry->d_lockref.count--; rcu_read_lock(); finish_dput(dentry); } EXPORT_SYMBOL(d_make_discardable); static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); lockref_get(&alias->d_lockref); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Caller MUST be holding rcu_read_lock() and be guaranteed * that inode won't get freed until rcu_read_unlock(). */ struct dentry *d_find_alias_rcu(struct inode *inode) { struct hlist_head *l = &inode->i_dentry; struct dentry *de = NULL; spin_lock(&inode->i_lock); // ->i_dentry and ->i_rcu are colocated, but the latter won't be // used without having I_FREEING set, which means no aliases left if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { de = hlist_entry(l->first, struct dentry, d_u.d_alias); } else { hlist_for_each_entry(de, l, d_u.d_alias) if (!d_unhashed(de)) break; } } spin_unlock(&inode->i_lock); return de; } void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) to_shrink_list(dentry, dispose); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_dispose_if_unused); /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { LIST_HEAD(dispose); struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { do { rcu_read_unlock(); victim = __dentry_kill(victim); rcu_read_lock(); } while (victim && lock_for_kill(victim)); rcu_read_unlock(); if (victim) spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!lock_for_kill(dentry)) { bool can_free; rcu_read_unlock(); d_shrink_del(dentry); can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } d_shrink_del(dentry); shrink_kill(dentry); } } EXPORT_SYMBOL(shrink_dentry_list); static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add_obj and list_lru_del_obj. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent, *dentry; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: dentry = d_first_child(this_parent); resume: hlist_for_each_entry_from(dentry, d_sib) { if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!hlist_empty(&dentry->d_children)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { dentry = this_parent; this_parent = dentry->d_parent; spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ hlist_for_each_entry_continue(dentry, d_sib) { if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { rcu_read_unlock(); goto resume; } } goto ascend; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; /* locks: mount_locked_reader && dentry->d_lock */ static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; guard(mount_locked_reader)(); d_walk(parent->dentry, &data, path_check_mount); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; read_seqlock_excl(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: read_sequnlock_excl(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_children list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; } else if (dentry->d_lockref.count < 0) { data->found++; } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry) { if (dentry->d_flags & DCACHE_PERSISTENT) { dentry->d_flags &= ~DCACHE_PERSISTENT; dentry->d_lockref.count--; } return select_collect(_data, dentry); } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (!dentry->d_lockref.count) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_tree - prune dcache * @parent: parent of entries to prune * @for_umount: true if we want to unpin the persistent ones * * Prune the dcache to remove unused children of the parent dentry. */ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, for_umount ? select_collect_umount : select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { spin_lock(&data.victim->d_lock); if (!lock_for_kill(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { shrink_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } void shrink_dcache_parent(struct dentry *parent) { shrink_dcache_tree(parent, false); } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!hlist_empty(&dentry->d_children)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_tree(dentry, true); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; rwsem_assert_held_write(&sb->s_umount); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { *victim = dget_dlock(dentry); return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. * This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_shortname.string; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->count, 1); dname = p->name; } else { dname = dentry->d_shortname.string; } dentry->__d_name.len = name->len; dentry->__d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = sb->__s_d_op; dentry->d_flags = sb->s_d_flags; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_HLIST_NODE(&dentry->d_sib); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ dentry->d_parent = dget_dlock(parent); hlist_add_head(&dentry->d_sib, &parent->d_children); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) { dentry->d_flags |= DCACHE_NORCU; /* d_op_flags(&anon_ops) is 0 */ if (!dentry->d_op) dentry->d_op = &anon_ops; } return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); #define DCACHE_OP_FLAGS \ (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | \ DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_PRUNE | \ DCACHE_OP_REAL) static unsigned int d_op_flags(const struct dentry_operations *op) { unsigned int flags = 0; if (op) { if (op->d_hash) flags |= DCACHE_OP_HASH; if (op->d_compare) flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) flags |= DCACHE_OP_DELETE; if (op->d_prune) flags |= DCACHE_OP_PRUNE; if (op->d_real) flags |= DCACHE_OP_REAL; } return flags; } static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { unsigned int flags = d_op_flags(op); WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & DCACHE_OP_FLAGS); dentry->d_op = op; if (flags) dentry->d_flags |= flags; } void set_default_d_op(struct super_block *s, const struct dentry_operations *ops) { unsigned int flags = d_op_flags(ops); s->__s_d_op = ops; s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags; } EXPORT_SYMBOL(set_default_d_op); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. */ if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); spin_lock(&entry->d_lock); __d_instantiate(entry, inode); spin_unlock(&entry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); spin_lock(&entry->d_lock); __d_instantiate(entry, inode); spin_unlock(&entry->d_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct super_block *sb; struct dentry *new, *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); sb = inode->i_sb; res = d_find_any_alias(inode); /* existing alias? */ if (res) goto out; new = d_alloc_anon(sb); if (!new) { res = ERR_PTR(-ENOMEM); goto out; } security_d_instantiate(new, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); /* recheck under lock */ if (likely(!res)) { /* still no alias, attach a disconnected dentry */ unsigned add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); hlist_bl_unlock(&sb->s_roots); } spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); inode = NULL; /* consumed by new->d_inode */ res = new; } else { spin_unlock(&inode->i_lock); dput(new); } out: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the case-exact dentry * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { d_lookup_done(found); dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name * @dentry: the negative dentry that was passed to the parent's lookup func * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false */ bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } EXPORT_SYMBOL_GPL(d_same_name); /* * This is __d_lookup_rcu() when the parent dentry has * DCACHE_OP_COMPARE, which makes things much nastier. */ static noinline struct dentry *__d_lookup_rcu_op_compare( const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { int tlen; const char *tname; unsigned seq; seqretry: seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; *seqp = seq; return dentry; } return NULL; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) return __d_lookup_rcu_op_compare(parent, name, seqp); /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (dentry->d_name.hash_len != hashlen) continue; if (unlikely(dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)) continue; /* * Check for the dentry being unhashed. * * As tempting as it is, we *can't* skip it because of a race window * between us finding the dentry before it gets unhashed and loading * the sequence counter after unhashing is finished. * * We can at least predict on it. */ if (unlikely(d_unhashed(dentry))) continue; *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { if (dentry_negative_policy) __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { unsigned n = READ_ONCE(dir->i_dir_seq); if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); if (wq_has_sleeper(d_wait)) wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = __d_alloc(parent->d_sb, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); new->d_flags |= DCACHE_PAR_LOOKUP; spin_lock(&parent->d_lock); new->d_parent = dget_dlock(parent); hlist_add_head(&new->d_sib, &parent->d_children); if (parent->d_flags & DCACHE_DISCONNECTED) new->d_flags |= DCACHE_DISCONNECTED; spin_unlock(&parent->d_lock); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. */ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); new->d_wait = wq; hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); /* * - Unhash the dentry * - Retrieve and clear the waitqueue head in dentry * - Return the waitqueue head */ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) { wait_queue_head_t *d_wait; struct hlist_bl_head *b; lockdep_assert_held(&dentry->d_lock); b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } void __d_lookup_unhash_wake(struct dentry *dentry) { spin_lock(&dentry->d_lock); wake_up_all(__d_lookup_unhash(dentry)); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(__d_lookup_unhash_wake); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode, const struct dentry_operations *ops) { wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(dentry); } if (unlikely(ops)) d_set_d_op(dentry, ops); if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n, d_wait); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode, NULL); } EXPORT_SYMBOL(d_add); struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); WARN_ON(!inode); security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); __d_instantiate(dentry, inode); dentry->d_flags |= DCACHE_PERSISTENT; dget_dlock(dentry); if (d_unhashed(dentry)) __d_rehash(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); return dentry; } EXPORT_SYMBOL(d_make_persistent); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->__d_name.name, dentry->__d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ dentry->__d_name.name = target->__d_name.name; target->d_shortname = dentry->d_shortname; target->__d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ target->__d_name.name = dentry->__d_name.name; dentry->d_shortname = target->d_shortname; dentry->__d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. */ for (int i = 0; i < DNAME_INLINE_WORDS; i++) swap(dentry->d_shortname.words[i], target->d_shortname.words[i]); } } swap(dentry->__d_name.hash_len, target->__d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); dentry->__d_name = target->__d_name; } else { dentry->d_shortname = target->d_shortname; dentry->__d_name.name = dentry->d_shortname.string; dentry->__d_name.hash_len = target->__d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative dcache * entries should not be moved in this way. Caller must hold rename_lock, the * i_rwsem of the source and target directories (exclusively), and the sb-> * s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); d_wait = __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); if (!hlist_unhashed(&target->d_sib)) __hlist_del(&target->d_sib); hlist_add_head(&target->d_sib, &target->d_parent->d_children); __d_rehash(target); fsnotify_update_flags(target); } if (!hlist_unhashed(&dentry->d_sib)) __hlist_del(&dentry->d_sib); hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n, d_wait); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_exchange); /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_rwsem, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static int __d_unalias(struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: if (alias->d_op && alias->d_op->d_unalias_trylock && !alias->d_op->d_unalias_trylock(alias)) goto out_err; __d_move(alias, dentry, false); if (alias->d_op && alias->d_op->d_unalias_unlock) alias->d_op->d_unalias_unlock(alias); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry, const struct dentry_operations *ops) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode, ops); return NULL; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { return d_splice_alias_ops(inode, dentry, NULL); } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool subdir; unsigned seq; if (new_dentry == old_dentry) return true; /* Access d_parent under rcu as d_move() may change it. */ rcu_read_lock(); seq = read_seqbegin(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); /* Try lockless once... */ if (read_seqretry(&rename_lock, seq)) { /* ...else acquire lock for progress even on deep chains. */ read_seqlock_excl(&rename_lock); subdir = d_ancestor(old_dentry, new_dentry); read_sequnlock_excl(&rename_lock); } rcu_read_unlock(); return subdir; } EXPORT_SYMBOL(is_subdir); void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); } EXPORT_SYMBOL(d_mark_tmpfile); void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; inode_dec_link_count(inode); d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); /* * Obtain inode number of the parent dentry. */ ino_t d_parent_ino(struct dentry *dentry) { struct dentry *parent; struct inode *iparent; unsigned seq; ino_t ret; scoped_guard(rcu) { seq = raw_seqcount_begin(&dentry->d_seq); parent = READ_ONCE(dentry->d_parent); iparent = d_inode_rcu(parent); if (likely(iparent)) { ret = iparent->i_ino; if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; } } spin_lock(&dentry->d_lock); ret = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ __dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_shortname.string); runtime_const_init(ptr, __dentry_cache); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; runtime_const_init(shift, d_hash_shift); runtime_const_init(ptr, dentry_hashtable); } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
2 2 1 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> struct nft_range_expr { struct nft_data data_from; struct nft_data data_to; u8 sreg; u8 len; enum nft_range_ops op:8; }; void nft_range_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_range_expr *priv = nft_expr_priv(expr); int d1, d2; d1 = memcmp(&regs->data[priv->sreg], &priv->data_from, priv->len); d2 = memcmp(&regs->data[priv->sreg], &priv->data_to, priv->len); switch (priv->op) { case NFT_RANGE_EQ: if (d1 < 0 || d2 > 0) regs->verdict.code = NFT_BREAK; break; case NFT_RANGE_NEQ: if (d1 >= 0 && d2 <= 0) regs->verdict.code = NFT_BREAK; break; } } static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { [NFTA_RANGE_SREG] = { .type = NLA_U32 }, [NFTA_RANGE_OP] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, }; static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_range_expr *priv = nft_expr_priv(expr); struct nft_data_desc desc_from = { .type = NFT_DATA_VALUE, .size = sizeof(priv->data_from), }; struct nft_data_desc desc_to = { .type = NFT_DATA_VALUE, .size = sizeof(priv->data_to), }; int err; u32 op; if (!tb[NFTA_RANGE_SREG] || !tb[NFTA_RANGE_OP] || !tb[NFTA_RANGE_FROM_DATA] || !tb[NFTA_RANGE_TO_DATA]) return -EINVAL; err = nft_data_init(NULL, &priv->data_from, &desc_from, tb[NFTA_RANGE_FROM_DATA]); if (err < 0) return err; err = nft_data_init(NULL, &priv->data_to, &desc_to, tb[NFTA_RANGE_TO_DATA]); if (err < 0) goto err1; if (desc_from.len != desc_to.len) { err = -EINVAL; goto err2; } err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg, desc_from.len); if (err < 0) goto err2; err = nft_parse_u32_check(tb[NFTA_RANGE_OP], U8_MAX, &op); if (err < 0) goto err2; switch (op) { case NFT_RANGE_EQ: case NFT_RANGE_NEQ: break; default: err = -EINVAL; goto err2; } priv->op = op; priv->len = desc_from.len; return 0; err2: nft_data_release(&priv->data_to, desc_to.type); err1: nft_data_release(&priv->data_from, desc_from.type); return err; } static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_range_expr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op))) goto nla_put_failure; if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from, NFT_DATA_VALUE, priv->len) < 0 || nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to, NFT_DATA_VALUE, priv->len) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nft_expr_ops nft_range_ops = { .type = &nft_range_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)), .eval = nft_range_eval, .init = nft_range_init, .dump = nft_range_dump, .reduce = NFT_REDUCE_READONLY, }; struct nft_expr_type nft_range_type __read_mostly = { .name = "range", .ops = &nft_range_ops, .policy = nft_range_policy, .maxattr = NFTA_RANGE_MAX, .owner = THIS_MODULE, };
819 3040 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM skb #if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SKB_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #undef FN #define FN(reason) TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason); DEFINE_DROP_REASON(FN, FN) #undef FN #undef FNe #define FN(reason) { SKB_DROP_REASON_##reason, #reason }, #define FNe(reason) { SKB_DROP_REASON_##reason, #reason } /* * Tracepoint for free an sk_buff: */ TRACE_EVENT(kfree_skb, TP_PROTO(struct sk_buff *skb, void *location, enum skb_drop_reason reason, struct sock *rx_sk), TP_ARGS(skb, location, reason, rx_sk), TP_STRUCT__entry( __field(void *, skbaddr) __field(void *, location) __field(void *, rx_sk) __field(unsigned short, protocol) __field(enum skb_drop_reason, reason) ), TP_fast_assign( __entry->skbaddr = skb; __entry->location = location; __entry->rx_sk = rx_sk; __entry->protocol = ntohs(skb->protocol); __entry->reason = reason; ), TP_printk("skbaddr=%p rx_sk=%p protocol=%u location=%pS reason: %s", __entry->skbaddr, __entry->rx_sk, __entry->protocol, __entry->location, __print_symbolic(__entry->reason, DEFINE_DROP_REASON(FN, FNe))) ); #undef FN #undef FNe TRACE_EVENT(consume_skb, TP_PROTO(struct sk_buff *skb, void *location), TP_ARGS(skb, location), TP_STRUCT__entry( __field( void *, skbaddr) __field( void *, location) ), TP_fast_assign( __entry->skbaddr = skb; __entry->location = location; ), TP_printk("skbaddr=%p location=%pS", __entry->skbaddr, __entry->location) ); TRACE_EVENT(skb_copy_datagram_iovec, TP_PROTO(const struct sk_buff *skb, int len), TP_ARGS(skb, len), TP_STRUCT__entry( __field( const void *, skbaddr ) __field( int, len ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = len; ), TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len) ); #endif /* _TRACE_SKB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
3 2 1 3 4 2 2 2 1 1 1 1 1 4 3 1 2 2 2 4 13 4 4 4 4 4 4 4 4 3 3 4 4 4 4 4 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 6 4 2 4 1 1 6 9 10 9 9 9 9 9 9 10 3 8 9 9 9 4 4 1 1 1 1 1 1 1 1 1 1 15 15 15 15 15 15 1 1 1 4 4 2 15 6 4 2 1 1 1 1 1 1 1 25 3 3 24 23 21 1 21 7 20 6 20 5 4 16 7 8 7 6 4 13 3 1 2 3 1 1 1 2 1 1 1 1 6 5 6 2 2 2 1 1 2 1 2 2 1 1 1 1 1 1 1 4 1 1 1 9 4 9 5 5 1 9 1 1 9 2 2 9 5 6 5 9 4 3 9 16 16 16 14 14 14 14 14 14 14 9 14 11 3 3 229 4 2 2 1 1 1 229 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> * * The code this is based on carried the following copyright notice: * --- * (C) Copyright 2001-2006 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com * Re-worked by Ben Greear <greearb@candelatech.com> * --- */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/net_tstamp.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/if_link.h> #include <linux/if_macvlan.h> #include <linux/hash.h> #include <linux/workqueue.h> #include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <linux/netpoll.h> #include <linux/phy.h> #define MACVLAN_HASH_BITS 8 #define MACVLAN_HASH_SIZE (1<<MACVLAN_HASH_BITS) #define MACVLAN_DEFAULT_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 #define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; struct list_head vlans; struct sk_buff_head bc_queue; struct work_struct bc_work; u32 bc_queue_len_used; int bc_cutoff; u32 flags; int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(bc_filter, MACVLAN_MC_FILTER_SZ); DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { struct hlist_node hlist; struct macvlan_dev *vlan; unsigned char addr[6+2] __aligned(sizeof(u16)); struct rcu_head rcu; }; struct macvlan_skb_cb { const struct macvlan_dev *src; }; #define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0])) static void macvlan_port_destroy(struct net_device *dev); static void update_port_bc_queue_len(struct macvlan_port *port); static inline bool macvlan_passthru(const struct macvlan_port *port) { return port->flags & MACVLAN_F_PASSTHRU; } static inline void macvlan_set_passthru(struct macvlan_port *port) { port->flags |= MACVLAN_F_PASSTHRU; } static inline bool macvlan_addr_change(const struct macvlan_port *port) { return port->flags & MACVLAN_F_ADDRCHANGE; } static inline void macvlan_set_addr_change(struct macvlan_port *port) { port->flags |= MACVLAN_F_ADDRCHANGE; } static inline void macvlan_clear_addr_change(struct macvlan_port *port) { port->flags &= ~MACVLAN_F_ADDRCHANGE; } /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, MACVLAN_HASH_BITS); } static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } static struct macvlan_source_entry *macvlan_hash_lookup_source( const struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (ether_addr_equal_64bits(entry->addr, addr) && entry->vlan == vlan) return entry; } return NULL; } static int macvlan_hash_add_source(struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_port *port = vlan->port; struct macvlan_source_entry *entry; struct hlist_head *h; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) return 0; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; ether_addr_copy(entry->addr, addr); entry->vlan = vlan; h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; hlist_add_head_rcu(&entry->hlist, h); vlan->macaddr_count++; return 0; } static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; u32 idx = macvlan_eth_hash(addr); hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); } static void macvlan_hash_del_source(struct macvlan_source_entry *entry) { hlist_del_rcu(&entry->hlist); kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); if (sync) synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ eth_hw_addr_set(vlan->dev, addr); macvlan_hash_add(vlan); } static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. */ if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) return true; return false; } static int macvlan_broadcast_one(struct sk_buff *skb, const struct macvlan_dev *vlan, const struct ethhdr *eth, bool local) { struct net_device *dev = vlan->dev; if (local) return __dev_forward_skb(dev, skb); skb->dev = dev; if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; return 0; } static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) { return (u32)(((unsigned long)vlan) >> L1_CACHE_SHIFT); } static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { u32 val = get_unaligned((u32 *)(addr + 2)); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); } static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port, struct net_device *src, enum macvlan_mode mode) { const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; struct sk_buff *nskb; unsigned int i; int err; unsigned int hash; if (skb->protocol == htons(ETH_P_PAUSE)) return; hash_for_each_rcu(port->vlan_hash, i, vlan, hlist) { if (vlan->dev == src || !(vlan->mode & mode)) continue; hash = mc_hash(vlan, eth->h_dest); if (!test_bit(hash, vlan->mc_filter)) continue; err = NET_RX_DROP; nskb = skb_clone(skb, GFP_ATOMIC); if (likely(nskb)) err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE) ?: netif_rx(nskb); macvlan_count_rx(vlan, skb->len + ETH_HLEN, err == NET_RX_SUCCESS, true); } } static void macvlan_multicast_rx(const struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { if (!src) /* frame comes from an external address */ macvlan_broadcast(skb, port, NULL, MACVLAN_MODE_PRIVATE | MACVLAN_MODE_VEPA | MACVLAN_MODE_PASSTHRU| MACVLAN_MODE_BRIDGE); else if (src->mode == MACVLAN_MODE_VEPA) /* flood to everyone except source */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA | MACVLAN_MODE_BRIDGE); else /* * flood only to VEPA ports, bridge ports * already saw the frame on the way out. */ macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA); } static void macvlan_process_broadcast(struct work_struct *w) { struct macvlan_port *port = container_of(w, struct macvlan_port, bc_work); struct sk_buff *skb; struct sk_buff_head list; __skb_queue_head_init(&list); spin_lock_bh(&port->bc_queue.lock); skb_queue_splice_tail_init(&port->bc_queue, &list); spin_unlock_bh(&port->bc_queue.lock); while ((skb = __skb_dequeue(&list))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; rcu_read_lock(); macvlan_multicast_rx(port, src, skb); rcu_read_unlock(); if (src) dev_put(src->dev); consume_skb(skb); cond_resched(); } } static void macvlan_broadcast_enqueue(struct macvlan_port *port, const struct macvlan_dev *src, struct sk_buff *skb) { struct sk_buff *nskb; int err = -ENOMEM; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) goto err; MACVLAN_SKB_CB(nskb)->src = src; spin_lock(&port->bc_queue.lock); if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) { if (src) dev_hold(src->dev); __skb_queue_tail(&port->bc_queue, nskb); err = 0; } spin_unlock(&port->bc_queue.lock); queue_work(system_dfl_wq, &port->bc_work); if (err) goto free_nskb; return; free_nskb: kfree_skb(nskb); err: dev_core_stats_rx_dropped_inc(skb->dev); } static void macvlan_flush_sources(struct macvlan_port *port, struct macvlan_dev *vlan) { struct macvlan_source_entry *entry; struct hlist_node *next; int i; hash_for_each_safe(port->vlan_source_hash, i, next, entry, hlist) if (entry->vlan == vlan) macvlan_hash_del_source(entry); vlan->macaddr_count = 0; } static void macvlan_forward_source_one(struct sk_buff *skb, struct macvlan_dev *vlan) { struct sk_buff *nskb; struct net_device *dev; int len; int ret; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; len = nskb->len + ETH_HLEN; nskb->dev = dev; if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr)) nskb->pkt_type = PACKET_HOST; ret = __netif_rx(nskb); macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { if (ether_addr_equal_64bits(entry->addr, addr)) { if (entry->vlan->flags & MACVLAN_FLAG_NODST) consume = true; macvlan_forward_source_one(skb, entry->vlan); } } return consume; } /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { struct macvlan_port *port; struct sk_buff *skb = *pskb; const struct ethhdr *eth = eth_hdr(skb); const struct macvlan_dev *vlan; const struct macvlan_dev *src; struct net_device *dev; unsigned int len = 0; int ret; rx_handler_result_t handle_res; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; port = macvlan_port_get_rcu(skb->dev); if (is_multicast_ether_addr(eth->h_dest)) { unsigned int hash; skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { /* forward to original port. */ vlan = src; ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?: __netif_rx(skb); handle_res = RX_HANDLER_CONSUMED; goto out; } hash = mc_hash(NULL, eth->h_dest); if (test_bit(hash, port->bc_filter)) macvlan_broadcast_enqueue(port, src, skb); else if (test_bit(hash, port->mc_filter)) macvlan_multicast_rx(port, src, skb); return RX_HANDLER_PASS; } if (macvlan_forward_source(skb, port, eth->h_source)) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); else vlan = macvlan_hash_lookup(port, eth->h_dest); if (!vlan || vlan->mode == MACVLAN_MODE_SOURCE) return RX_HANDLER_PASS; dev = vlan->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } len = skb->len + ETH_HLEN; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { ret = NET_RX_DROP; handle_res = RX_HANDLER_CONSUMED; goto out; } *pskb = skb; skb->dev = dev; skb->pkt_type = PACKET_HOST; ret = NET_RX_SUCCESS; handle_res = RX_HANDLER_ANOTHER; out: macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); return handle_res; } static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { const struct macvlan_dev *vlan = netdev_priv(dev); const struct macvlan_port *port = vlan->port; const struct macvlan_dev *dest; if (vlan->mode == MACVLAN_MODE_BRIDGE) { const struct ethhdr *eth = skb_eth_hdr(skb); /* send to other bridge ports directly */ if (is_multicast_ether_addr(eth->h_dest)) { skb_reset_mac_header(skb); macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE); goto xmit_world; } dest = macvlan_hash_lookup(port, eth->h_dest); if (dest && dest->mode == MACVLAN_MODE_BRIDGE) { /* send to lowerdev first for its network taps */ dev_forward_skb(vlan->lowerdev, skb); return NET_XMIT_SUCCESS; } } xmit_world: skb->dev = vlan->lowerdev; return dev_queue_xmit_accel(skb, netdev_get_sb_channel(dev) ? dev : NULL); } static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); unsigned int len = skb->len; int ret; if (unlikely(netpoll_tx_running(dev))) return macvlan_netpoll_send_skb(vlan, skb); ret = macvlan_queue_xmit(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(vlan->pcpu_stats->tx_dropped); } return ret; } static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { const struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return dev_hard_header(skb, lowerdev, type, daddr, saddr ? : dev->dev_addr, len); } static const struct header_ops macvlan_hard_header_ops = { .create = macvlan_hard_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, .parse_protocol = eth_header_parse_protocol, }; static int macvlan_open(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; int err; if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto out; } goto hash_add; } err = -EADDRINUSE; if (macvlan_addr_busy(vlan->port, dev->dev_addr)) goto out; /* Attempt to populate accel_priv which is used to offload the L2 * forwarding requests for unicast packets. */ if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) vlan->accel_priv = lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev); /* If earlier attempt to offload failed, or accel_priv is not * populated we must add the unicast address to the lower device. */ if (IS_ERR_OR_NULL(vlan->accel_priv)) { vlan->accel_priv = NULL; err = dev_uc_add(lowerdev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(lowerdev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(lowerdev, 1); if (err < 0) goto clear_multi; } hash_add: macvlan_hash_add(vlan); return 0; clear_multi: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); del_unicast: if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } else { dev_uc_del(lowerdev, dev->dev_addr); } out: return err; } static int macvlan_stop(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (vlan->accel_priv) { lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev, vlan->accel_priv); vlan->accel_priv = NULL; } dev_uc_unsync(lowerdev, dev); dev_mc_unsync(lowerdev, dev); if (macvlan_passthru(vlan->port)) { if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) dev_set_promiscuity(lowerdev, -1); goto hash_del; } if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(lowerdev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(lowerdev, -1); dev_uc_del(lowerdev, dev->dev_addr); hash_del: macvlan_hash_del(vlan, !dev->dismantle); return 0; } static int macvlan_sync_address(struct net_device *dev, const unsigned char *addr) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; int err; if (!(dev->flags & IFF_UP)) { /* Just copy in the new address */ eth_hw_addr_set(dev, addr); } else { /* Rehash and update the device filters */ if (macvlan_addr_busy(vlan->port, addr)) return -EADDRINUSE; if (!macvlan_passthru(port)) { err = dev_uc_add(lowerdev, addr); if (err) return err; dev_uc_del(lowerdev, dev->dev_addr); } macvlan_hash_change_addr(vlan, addr); } if (macvlan_passthru(port) && !macvlan_addr_change(port)) { /* Since addr_change isn't set, we are here due to lower * device change. Save the lower-dev address so we can * restore it later. */ ether_addr_copy(vlan->port->perm_addr, lowerdev->dev_addr); } macvlan_clear_addr_change(port); return 0; } static int macvlan_set_mac_address(struct net_device *dev, void *p) { struct macvlan_dev *vlan = netdev_priv(dev); struct sockaddr_storage *addr = p; if (!is_valid_ether_addr(addr->__data)) return -EADDRNOTAVAIL; /* If the addresses are the same, this is a no-op */ if (ether_addr_equal(dev->dev_addr, addr->__data)) return 0; if (vlan->mode == MACVLAN_MODE_PASSTHRU) { macvlan_set_addr_change(vlan->port); return dev_set_mac_address(vlan->lowerdev, addr, NULL); } if (macvlan_addr_busy(vlan->port, addr->__data)) return -EADDRINUSE; return macvlan_sync_address(dev, addr->__data); } static void macvlan_change_rx_flags(struct net_device *dev, int change) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (!macvlan_passthru(vlan->port) && change & IFF_PROMISC) dev_set_promiscuity(lowerdev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void macvlan_compute_filter(unsigned long *mc_filter, struct net_device *dev, struct macvlan_dev *vlan, int cutoff) { if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ); } else { DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ); struct netdev_hw_addr *ha; bitmap_zero(filter, MACVLAN_MC_FILTER_SZ); netdev_for_each_mc_addr(ha, dev) { if (!vlan && ha->synced <= cutoff) continue; __set_bit(mc_hash(vlan, ha->addr), filter); } __set_bit(mc_hash(vlan, dev->broadcast), filter); bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ); } } static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan) { if (vlan->port->bc_cutoff < 0) { bitmap_zero(vlan->port->bc_filter, MACVLAN_MC_FILTER_SZ); return; } macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL, vlan->port->bc_cutoff); } static void macvlan_set_mac_lists(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); macvlan_compute_filter(vlan->mc_filter, dev, vlan, 0); dev_uc_sync(vlan->lowerdev, dev); dev_mc_sync(vlan->lowerdev, dev); /* This is slightly inaccurate as we're including the subscription * list of vlan->lowerdev too. * * Bug alert: This only works if everyone has the same broadcast * address as lowerdev. As soon as someone changes theirs this * will break. * * However, this is already broken as when you change your broadcast * address we don't get called. * * The solution is to maintain a list of broadcast addresses like * we do for uc/mc, if you care. */ macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL, 0); macvlan_recompute_bc_filter(vlan); } static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff) { if (vlan->port->bc_cutoff == cutoff) return; vlan->port->bc_cutoff = cutoff; macvlan_recompute_bc_filter(vlan); } static int macvlan_change_mtu(struct net_device *dev, int new_mtu) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->lowerdev->mtu < new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int macvlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return generic_hwtstamp_get_lower(real_dev, cfg); } static int macvlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = macvlan_dev_real_dev(dev); if (!net_eq(dev_net(dev), &init_net)) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } /* * macvlan network devices have devices nesting below it and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key macvlan_netdev_addr_lock_key; #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) #define ALWAYS_ON_FEATURES ALWAYS_ON_OFFLOADS #define MACVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_LRO | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) static void macvlan_set_lockdep_class(struct net_device *dev) { netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macvlan_netdev_addr_lock_key); } static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; struct macvlan_port *port = vlan->port; dev->state = (dev->state & ~MACVLAN_STATE_MASK) | (lowerdev->state & MACVLAN_STATE_MASK); dev->features = lowerdev->features & MACVLAN_FEATURES; dev->features |= ALWAYS_ON_FEATURES; dev->hw_features |= NETIF_F_LRO; dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; dev->lltx = true; netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; port->count += 1; /* Get macvlan's reference to lowerdev */ netdev_hold(lowerdev, &vlan->dev_tracker, GFP_KERNEL); return 0; } static void macvlan_uninit(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; free_percpu(vlan->pcpu_stats); macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); } static void macvlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->pcpu_stats) { struct vlan_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_errors = 0, tx_dropped = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(vlan->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* rx_errors & tx_dropped are u32, updated * without syncp protection. */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->rx_dropped = rx_errors; stats->tx_dropped = tx_dropped; } } static int macvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; return vlan_vid_add(lowerdev, proto, vid); } static int macvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *lowerdev = vlan->lowerdev; vlan_vid_del(lowerdev, proto, vid); return 0; } static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); return err; } static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; /* Support unicast filter only on passthru devices. * Multicast filter should be allowed on all devices. */ if (!macvlan_passthru(vlan->port) && is_unicast_ether_addr(addr)) return -EOPNOTSUPP; if (is_unicast_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } static void macvlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "macvlan", sizeof(drvinfo->driver)); strscpy(drvinfo->version, "0.1", sizeof(drvinfo->version)); } static int macvlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct macvlan_dev *vlan = netdev_priv(dev); return __ethtool_get_link_ksettings(vlan->lowerdev, cmd); } static int macvlan_ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { struct net_device *real_dev = macvlan_dev_real_dev(dev); return ethtool_get_ts_info_by_layer(real_dev, info); } static netdev_features_t macvlan_fix_features(struct net_device *dev, netdev_features_t features) { struct macvlan_dev *vlan = netdev_priv(dev); netdev_features_t lowerdev_features = vlan->lowerdev->features; netdev_features_t mask; features |= NETIF_F_ALL_FOR_ALL; features &= (vlan->set_features | ~MACVLAN_FEATURES); mask = features; lowerdev_features &= (features | ~NETIF_F_LRO); features = netdev_increment_features(lowerdev_features, features, mask); features |= ALWAYS_ON_FEATURES; features &= (ALWAYS_ON_FEATURES | MACVLAN_FEATURES); return features; } #ifdef CONFIG_NET_POLL_CONTROLLER static void macvlan_dev_poll_controller(struct net_device *dev) { return; } static int macvlan_dev_netpoll_setup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; struct netpoll *netpoll; int err; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void macvlan_dev_netpoll_cleanup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int macvlan_dev_get_iflink(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return READ_ONCE(vlan->lowerdev->ifindex); } static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_link_ksettings = macvlan_ethtool_get_link_ksettings, .get_drvinfo = macvlan_ethtool_get_drvinfo, .get_ts_info = macvlan_ethtool_get_ts_info, }; static const struct net_device_ops macvlan_netdev_ops = { .ndo_init = macvlan_init, .ndo_uninit = macvlan_uninit, .ndo_open = macvlan_open, .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, .ndo_change_mtu = macvlan_change_mtu, .ndo_fix_features = macvlan_fix_features, .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_rx_mode = macvlan_set_mac_lists, .ndo_get_stats64 = macvlan_dev_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = macvlan_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = macvlan_vlan_rx_kill_vid, .ndo_fdb_add = macvlan_fdb_add, .ndo_fdb_del = macvlan_fdb_del, .ndo_fdb_dump = ndo_dflt_fdb_dump, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = macvlan_dev_poll_controller, .ndo_netpoll_setup = macvlan_dev_netpoll_setup, .ndo_netpoll_cleanup = macvlan_dev_netpoll_cleanup, #endif .ndo_get_iflink = macvlan_dev_get_iflink, .ndo_features_check = passthru_features_check, .ndo_hwtstamp_get = macvlan_hwtstamp_get, .ndo_hwtstamp_set = macvlan_hwtstamp_set, }; static void macvlan_dev_free(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); /* Get rid of the macvlan's reference to lowerdev */ netdev_put(vlan->lowerdev, &vlan->dev_tracker); } void macvlan_common_setup(struct net_device *dev) { ether_setup(dev); /* ether_setup() has set dev->min_mtu to ETH_MIN_MTU. */ dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->priv_flags |= IFF_UNICAST_FLT; dev->change_proto_down = true; dev->netdev_ops = &macvlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macvlan_dev_free; dev->header_ops = &macvlan_hard_header_ops; dev->ethtool_ops = &macvlan_ethtool_ops; } EXPORT_SYMBOL_GPL(macvlan_common_setup); static void macvlan_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->priv_flags |= IFF_NO_QUEUE; } static int macvlan_port_create(struct net_device *dev) { struct macvlan_port *port; unsigned int i; int err; if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; if (netdev_is_rx_handler_busy(dev)) return -EBUSY; port = kzalloc(sizeof(*port), GFP_KERNEL); if (port == NULL) return -ENOMEM; port->dev = dev; ether_addr_copy(port->perm_addr, dev->dev_addr); INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_source_hash[i]); port->bc_queue_len_used = 0; port->bc_cutoff = 1; skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); err = netdev_rx_handler_register(dev, macvlan_handle_frame, port); if (err) kfree(port); else dev->priv_flags |= IFF_MACVLAN_PORT; return err; } static void macvlan_port_destroy(struct net_device *dev) { struct macvlan_port *port = macvlan_port_get_rtnl(dev); struct sk_buff *skb; dev->priv_flags &= ~IFF_MACVLAN_PORT; netdev_rx_handler_unregister(dev); /* After this point, no packet can schedule bc_work anymore, * but we need to cancel it and purge left skbs if any. */ cancel_work_sync(&port->bc_work); while ((skb = __skb_dequeue(&port->bc_queue))) { const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src; if (src) dev_put(src->dev); kfree_skb(skb); } /* If the lower device address has been changed by passthru * macvlan, put it back. */ if (macvlan_passthru(port) && !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) { struct sockaddr_storage ss; ss.ss_family = port->dev->type; memcpy(&ss.__data, port->perm_addr, port->dev->addr_len); dev_set_mac_address(port->dev, &ss, NULL); } kfree(port); } static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct nlattr *nla, *head; int rem, len; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } if (!data) return 0; if (data[IFLA_MACVLAN_FLAGS] && nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) { case MACVLAN_MODE_PRIVATE: case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: case MACVLAN_MODE_SOURCE: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR_MODE]) { switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { case MACVLAN_MACADDR_ADD: case MACVLAN_MACADDR_DEL: case MACVLAN_MACADDR_FLUSH: case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } if (data[IFLA_MACVLAN_MACADDR]) { if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) return -EADDRNOTAVAIL; } if (data[IFLA_MACVLAN_MACADDR_DATA]) { head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { if (nla_type(nla) != IFLA_MACVLAN_MACADDR || nla_len(nla) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(nla))) return -EADDRNOTAVAIL; } } if (data[IFLA_MACVLAN_MACADDR_COUNT]) return -EINVAL; return 0; } /* * reconfigure list of remote source mac address * (only for macvlan devices in source mode) * Note regarding alignment: all netlink data is aligned to 4 Byte, which * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. */ static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, struct nlattr *data[]) { char *addr = NULL; int ret, rem, len; struct nlattr *nla, *head; struct macvlan_source_entry *entry; if (data[IFLA_MACVLAN_MACADDR]) addr = nla_data(data[IFLA_MACVLAN_MACADDR]); if (mode == MACVLAN_MACADDR_ADD) { if (!addr) return -EINVAL; return macvlan_hash_add_source(vlan, addr); } else if (mode == MACVLAN_MACADDR_DEL) { if (!addr) return -EINVAL; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) { macvlan_hash_del_source(entry); vlan->macaddr_count--; } } else if (mode == MACVLAN_MACADDR_FLUSH) { macvlan_flush_sources(vlan->port, vlan); } else if (mode == MACVLAN_MACADDR_SET) { macvlan_flush_sources(vlan->port, vlan); if (addr) { ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } if (!data[IFLA_MACVLAN_MACADDR_DATA]) return 0; head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); nla_for_each_attr(nla, head, len, rem) { addr = nla_data(nla); ret = macvlan_hash_add_source(vlan, addr); if (ret) return ret; } } else { return -EINVAL; } return 0; } int macvlan_common_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct macvlan_dev *vlan = netdev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct net_device *lowerdev; struct macvlan_port *port; bool create = false; int macmode; int err; if (!tb[IFLA_LINK]) return -EINVAL; lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (lowerdev == NULL) return -ENODEV; /* When creating macvlans or macvtaps on top of other macvlans - use * the real device as the lowerdev. */ if (netif_is_macvlan(lowerdev)) lowerdev = macvlan_dev_real_dev(lowerdev); if (!tb[IFLA_MTU]) dev->mtu = lowerdev->mtu; else if (dev->mtu > lowerdev->mtu) return -EINVAL; /* MTU range: 68 - lowerdev->max_mtu */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = lowerdev->max_mtu; if (!tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); if (!netif_is_macvlan_port(lowerdev)) { err = macvlan_port_create(lowerdev); if (err < 0) return err; create = true; } port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ if (macvlan_passthru(port)) { /* The macvlan port must be not created this time, * still goto destroy_macvlan_port for readability. */ err = -EINVAL; goto destroy_macvlan_port; } vlan->lowerdev = lowerdev; vlan->dev = dev; vlan->port = port; vlan->set_features = MACVLAN_FEATURES; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); if (data && data[IFLA_MACVLAN_FLAGS]) vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { if (port->count) { err = -EINVAL; goto destroy_macvlan_port; } macvlan_set_passthru(port); eth_hw_addr_inherit(dev, lowerdev); } if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) { err = -EINVAL; goto destroy_macvlan_port; } macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); err = macvlan_changelink_sources(vlan, macmode, data); if (err) goto destroy_macvlan_port; } vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN; if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); err = register_netdevice(dev); if (err < 0) goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev, extack); if (err) goto unregister_netdev; list_add_tail_rcu(&vlan->list, &port->vlans); update_port_bc_queue_len(vlan->port); netif_stacked_transfer_operstate(lowerdev, dev); linkwatch_fire_event(dev); return 0; unregister_netdev: /* macvlan_uninit would free the macvlan port */ unregister_netdevice(dev); return err; destroy_macvlan_port: /* the macvlan port may be freed by macvlan_uninit when fail to register. * so we destroy the macvlan port only when it's valid. */ if (create && macvlan_port_get_rtnl(lowerdev)) { macvlan_flush_sources(port, vlan); macvlan_port_destroy(port->dev); } return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); static int macvlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { return macvlan_common_newlink(dev, params, extack); } void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); if (vlan->mode == MACVLAN_MODE_SOURCE) macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); update_port_bc_queue_len(vlan->port); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); } EXPORT_SYMBOL_GPL(macvlan_dellink); static int macvlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; enum macvlan_macaddr_mode macmode; int ret; /* Validate mode, but don't set yet: setting flags may fail. */ if (data && data[IFLA_MACVLAN_MODE]) { set_mode = true; mode = nla_get_u32(data[IFLA_MACVLAN_MODE]); /* Passthrough mode can't be set or cleared dynamically */ if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; if (vlan->mode == MACVLAN_MODE_SOURCE && vlan->mode != mode) macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC; if (macvlan_passthru(vlan->port) && promisc) { int err; if (flags & MACVLAN_FLAG_NOPROMISC) err = dev_set_promiscuity(vlan->lowerdev, -1); else err = dev_set_promiscuity(vlan->lowerdev, 1); if (err < 0) return err; } vlan->flags = flags; } if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) { vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]); update_port_bc_queue_len(vlan->port); } if (data && data[IFLA_MACVLAN_BC_CUTOFF]) update_port_bc_cutoff( vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF])); if (set_mode) vlan->mode = mode; if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { if (vlan->mode != MACVLAN_MODE_SOURCE) return -EINVAL; macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); ret = macvlan_changelink_sources(vlan, macmode, data); if (ret) return ret; } return 0; } static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) { if (vlan->macaddr_count == 0) return 0; return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); } static size_t macvlan_get_size(const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */ + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */ ); } static int macvlan_fill_info_macaddr(struct sk_buff *skb, const struct macvlan_dev *vlan, const int i) { struct hlist_head *h = &vlan->port->vlan_source_hash[i]; struct macvlan_source_entry *entry; hlist_for_each_entry_rcu(entry, h, hlist, lockdep_rtnl_is_held()) { if (entry->vlan != vlan) continue; if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) return 1; } return 0; } static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct macvlan_port *port = vlan->port; int i; struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) goto nla_put_failure; if (vlan->macaddr_count > 0) { nest = nla_nest_start_noflag(skb, IFLA_MACVLAN_MACADDR_DATA); if (nest == NULL) goto nla_put_failure; for (i = 0; i < MACVLAN_HASH_SIZE; i++) { if (macvlan_fill_info_macaddr(skb, vlan, i)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used)) goto nla_put_failure; if (port->bc_cutoff != 1 && nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 }, [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT }, [IFLA_MACVLAN_BC_CUTOFF] = { .type = NLA_S32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) { /* common fields */ ops->validate = macvlan_validate; ops->maxtype = IFLA_MACVLAN_MAX; ops->policy = macvlan_policy; ops->changelink = macvlan_changelink; ops->get_size = macvlan_get_size; ops->fill_info = macvlan_fill_info; return rtnl_link_register(ops); }; EXPORT_SYMBOL_GPL(macvlan_link_register); static struct net *macvlan_get_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvlan_link_ops = { .kind = "macvlan", .setup = macvlan_setup, .newlink = macvlan_newlink, .dellink = macvlan_dellink, .get_link_net = macvlan_get_link_net, .priv_size = sizeof(struct macvlan_dev), }; static void update_port_bc_queue_len(struct macvlan_port *port) { u32 max_bc_queue_len_req = 0; struct macvlan_dev *vlan; list_for_each_entry(vlan, &port->vlans, list) { if (vlan->bc_queue_len_req > max_bc_queue_len_req) max_bc_queue_len_req = vlan->bc_queue_len_req; } port->bc_queue_len_used = max_bc_queue_len_req; } static int macvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan, *next; struct macvlan_port *port; LIST_HEAD(list_kill); if (!netif_is_macvlan_port(dev)) return NOTIFY_DONE; port = macvlan_port_get_rtnl(dev); switch (event) { case NETDEV_UP: case NETDEV_DOWN: case NETDEV_CHANGE: list_for_each_entry(vlan, &port->vlans, list) netif_stacked_transfer_operstate(vlan->lowerdev, vlan->dev); break; case NETDEV_FEAT_CHANGE: list_for_each_entry(vlan, &port->vlans, list) { netif_inherit_tso_max(vlan->dev, dev); netdev_update_features(vlan->dev); } break; case NETDEV_CHANGEMTU: list_for_each_entry(vlan, &port->vlans, list) { if (vlan->dev->mtu <= dev->mtu) continue; dev_set_mtu(vlan->dev, dev->mtu); } break; case NETDEV_CHANGEADDR: if (!macvlan_passthru(port)) return NOTIFY_DONE; vlan = list_first_entry_or_null(&port->vlans, struct macvlan_dev, list); if (vlan && macvlan_sync_address(vlan->dev, dev->dev_addr)) return NOTIFY_BAD; break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; list_for_each_entry_safe(vlan, next, &port->vlans, list) vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill); unregister_netdevice_many(&list_kill); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to all vlans */ list_for_each_entry(vlan, &port->vlans, list) call_netdevice_notifiers(event, vlan->dev); } return NOTIFY_DONE; } static struct notifier_block macvlan_notifier_block __read_mostly = { .notifier_call = macvlan_device_event, }; static int __init macvlan_init_module(void) { int err; register_netdevice_notifier(&macvlan_notifier_block); err = macvlan_link_register(&macvlan_link_ops); if (err < 0) goto err1; return 0; err1: unregister_netdevice_notifier(&macvlan_notifier_block); return err; } static void __exit macvlan_cleanup_module(void) { rtnl_link_unregister(&macvlan_link_ops); unregister_netdevice_notifier(&macvlan_notifier_block); } module_init(macvlan_init_module); module_exit(macvlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Driver for MAC address based VLANs"); MODULE_ALIAS_RTNL_LINK("macvlan");
1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 // SPDX-License-Identifier: GPL-2.0-or-later /* * USB RedRat3 IR Transceiver rc-core driver * * Copyright (c) 2011 by Jarod Wilson <jarod@redhat.com> * based heavily on the work of Stephen Cox, with additional * help from RedRat Ltd. * * This driver began life based on an old version of the first-generation * lirc_mceusb driver from the lirc 0.7.2 distribution. It was then * significantly rewritten by Stephen Cox with the aid of RedRat Ltd's * Chris Dodge. * * The driver was then ported to rc-core and significantly rewritten again, * by Jarod, using the in-kernel mceusb driver as a guide, after an initial * port effort was started by Stephen. * * TODO LIST: * - fix lirc not showing repeats properly * -- * * The RedRat3 is a USB transceiver with both send & receive, * with 2 separate sensors available for receive to enable * both good long range reception for general use, and good * short range reception when required for learning a signal. * * http://www.redrat.co.uk/ * * It uses its own little protocol to communicate, the required * parts of which are embedded within this driver. * -- */ #include <linux/unaligned.h> #include <linux/device.h> #include <linux/leds.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/usb/input.h> #include <media/rc-core.h> /* Driver Information */ #define DRIVER_AUTHOR "Jarod Wilson <jarod@redhat.com>" #define DRIVER_AUTHOR2 "The Dweller, Stephen Cox" #define DRIVER_DESC "RedRat3 USB IR Transceiver Driver" #define DRIVER_NAME "redrat3" /* bulk data transfer types */ #define RR3_ERROR 0x01 #define RR3_MOD_SIGNAL_IN 0x20 #define RR3_MOD_SIGNAL_OUT 0x21 /* Get the RR firmware version */ #define RR3_FW_VERSION 0xb1 #define RR3_FW_VERSION_LEN 64 /* Send encoded signal bulk-sent earlier*/ #define RR3_TX_SEND_SIGNAL 0xb3 #define RR3_SET_IR_PARAM 0xb7 #define RR3_GET_IR_PARAM 0xb8 /* Blink the red LED on the device */ #define RR3_BLINK_LED 0xb9 /* Read serial number of device */ #define RR3_READ_SER_NO 0xba #define RR3_SER_NO_LEN 4 /* Start capture with the RC receiver */ #define RR3_RC_DET_ENABLE 0xbb /* Stop capture with the RC receiver */ #define RR3_RC_DET_DISABLE 0xbc /* Start capture with the wideband receiver */ #define RR3_MODSIG_CAPTURE 0xb2 /* Return the status of RC detector capture */ #define RR3_RC_DET_STATUS 0xbd /* Reset redrat */ #define RR3_RESET 0xa0 /* Max number of lengths in the signal. */ #define RR3_IR_IO_MAX_LENGTHS 0x01 /* Periods to measure mod. freq. */ #define RR3_IR_IO_PERIODS_MF 0x02 /* Size of memory for main signal data */ #define RR3_IR_IO_SIG_MEM_SIZE 0x03 /* Delta value when measuring lengths */ #define RR3_IR_IO_LENGTH_FUZZ 0x04 /* Timeout for end of signal detection */ #define RR3_IR_IO_SIG_TIMEOUT 0x05 /* Minimum value for pause recognition. */ #define RR3_IR_IO_MIN_PAUSE 0x06 /* Clock freq. of EZ-USB chip */ #define RR3_CLK 24000000 /* Clock periods per timer count */ #define RR3_CLK_PER_COUNT 12 /* (RR3_CLK / RR3_CLK_PER_COUNT) */ #define RR3_CLK_CONV_FACTOR 2000000 /* USB bulk-in wideband IR data endpoint address */ #define RR3_WIDE_IN_EP_ADDR 0x81 /* USB bulk-in narrowband IR data endpoint address */ #define RR3_NARROW_IN_EP_ADDR 0x82 /* Size of the fixed-length portion of the signal */ #define RR3_DRIVER_MAXLENS 255 #define RR3_MAX_SIG_SIZE 512 #define RR3_TIME_UNIT 50 #define RR3_END_OF_SIGNAL 0x7f #define RR3_TX_TRAILER_LEN 2 #define RR3_RX_MIN_TIMEOUT 5 #define RR3_RX_MAX_TIMEOUT 2000 /* The 8051's CPUCS Register address */ #define RR3_CPUCS_REG_ADDR 0x7f92 #define USB_RR3USB_VENDOR_ID 0x112a #define USB_RR3USB_PRODUCT_ID 0x0001 #define USB_RR3IIUSB_PRODUCT_ID 0x0005 /* * The redrat3 encodes an IR signal as set of different lengths and a set * of indices into those lengths. This sets how much two lengths must * differ before they are considered distinct, the value is specified * in microseconds. * Default 5, value 0 to 127. */ static int length_fuzz = 5; module_param(length_fuzz, uint, 0644); MODULE_PARM_DESC(length_fuzz, "Length Fuzz (0-127)"); /* * When receiving a continuous ir stream (for example when a user is * holding a button down on a remote), this specifies the minimum size * of a space when the redrat3 sends a irdata packet to the host. Specified * in milliseconds. Default value 18ms. * The value can be between 2 and 30 inclusive. */ static int minimum_pause = 18; module_param(minimum_pause, uint, 0644); MODULE_PARM_DESC(minimum_pause, "Minimum Pause in ms (2-30)"); /* * The carrier frequency is measured during the first pulse of the IR * signal. The larger the number of periods used To measure, the more * accurate the result is likely to be, however some signals have short * initial pulses, so in some case it may be necessary to reduce this value. * Default 8, value 1 to 255. */ static int periods_measure_carrier = 8; module_param(periods_measure_carrier, uint, 0644); MODULE_PARM_DESC(periods_measure_carrier, "Number of Periods to Measure Carrier (1-255)"); struct redrat3_header { __be16 length; __be16 transfer_type; } __packed; /* sending and receiving irdata */ struct redrat3_irdata { struct redrat3_header header; __be32 pause; __be16 mod_freq_count; __be16 num_periods; __u8 max_lengths; __u8 no_lengths; __be16 max_sig_size; __be16 sig_size; __u8 no_repeats; __be16 lens[RR3_DRIVER_MAXLENS]; /* not aligned */ __u8 sigdata[RR3_MAX_SIG_SIZE]; } __packed; /* firmware errors */ struct redrat3_error { struct redrat3_header header; __be16 fw_error; } __packed; /* table of devices that work with this driver */ static const struct usb_device_id redrat3_dev_table[] = { /* Original version of the RedRat3 */ {USB_DEVICE(USB_RR3USB_VENDOR_ID, USB_RR3USB_PRODUCT_ID)}, /* Second Version/release of the RedRat3 - RetRat3-II */ {USB_DEVICE(USB_RR3USB_VENDOR_ID, USB_RR3IIUSB_PRODUCT_ID)}, {} /* Terminating entry */ }; /* Structure to hold all of our device specific stuff */ struct redrat3_dev { /* core device bits */ struct rc_dev *rc; struct device *dev; /* led control */ struct led_classdev led; atomic_t flash; struct usb_ctrlrequest flash_control; struct urb *flash_urb; u8 flash_in_buf; /* learning */ bool wideband; struct usb_ctrlrequest learn_control; struct urb *learn_urb; u8 learn_buf; /* save off the usb device pointer */ struct usb_device *udev; /* the receive endpoint */ struct usb_endpoint_descriptor *ep_narrow; /* the buffer to receive data */ void *bulk_in_buf; /* urb used to read ir data */ struct urb *narrow_urb; struct urb *wide_urb; /* the send endpoint */ struct usb_endpoint_descriptor *ep_out; /* usb dma */ dma_addr_t dma_in; /* Is the device currently transmitting?*/ bool transmitting; /* store for current packet */ struct redrat3_irdata irdata; u16 bytes_read; u32 carrier; char name[64]; char phys[64]; }; static void redrat3_dump_fw_error(struct redrat3_dev *rr3, int code) { if (!rr3->transmitting && (code != 0x40)) dev_info(rr3->dev, "fw error code 0x%02x: ", code); switch (code) { case 0x00: pr_cont("No Error\n"); break; /* Codes 0x20 through 0x2f are IR Firmware Errors */ case 0x20: pr_cont("Initial signal pulse not long enough to measure carrier frequency\n"); break; case 0x21: pr_cont("Not enough length values allocated for signal\n"); break; case 0x22: pr_cont("Not enough memory allocated for signal data\n"); break; case 0x23: pr_cont("Too many signal repeats\n"); break; case 0x28: pr_cont("Insufficient memory available for IR signal data memory allocation\n"); break; case 0x29: pr_cont("Insufficient memory available for IrDa signal data memory allocation\n"); break; /* Codes 0x30 through 0x3f are USB Firmware Errors */ case 0x30: pr_cont("Insufficient memory available for bulk transfer structure\n"); break; /* * Other error codes... These are primarily errors that can occur in * the control messages sent to the redrat */ case 0x40: if (!rr3->transmitting) pr_cont("Signal capture has been terminated\n"); break; case 0x41: pr_cont("Attempt to set/get and unknown signal I/O algorithm parameter\n"); break; case 0x42: pr_cont("Signal capture already started\n"); break; default: pr_cont("Unknown Error\n"); break; } } static u32 redrat3_val_to_mod_freq(struct redrat3_irdata *irdata) { u32 mod_freq = 0; u16 mod_freq_count = be16_to_cpu(irdata->mod_freq_count); if (mod_freq_count != 0) mod_freq = (RR3_CLK * be16_to_cpu(irdata->num_periods)) / (mod_freq_count * RR3_CLK_PER_COUNT); return mod_freq; } /* this function scales down the figures for the same result... */ static u32 redrat3_len_to_us(u32 length) { u32 biglen = length * 1000; u32 divisor = (RR3_CLK_CONV_FACTOR) / 1000; u32 result = (u32) (biglen / divisor); /* don't allow zero lengths to go back, breaks lirc */ return result ? result : 1; } /* * convert us back into redrat3 lengths * * length * 1000 length * 1000000 * ------------- = ---------------- = micro * rr3clk / 1000 rr3clk * 6 * 2 4 * 3 micro * rr3clk micro * rr3clk / 1000 * ----- = 4 ----- = 6 -------------- = len --------------------- * 3 2 1000000 1000 */ static u32 redrat3_us_to_len(u32 microsec) { u32 result; u32 divisor; microsec = (microsec > IR_MAX_DURATION) ? IR_MAX_DURATION : microsec; divisor = (RR3_CLK_CONV_FACTOR / 1000); result = (u32)(microsec * divisor) / 1000; /* don't allow zero lengths to go back, breaks lirc */ return result ? result : 1; } static void redrat3_process_ir_data(struct redrat3_dev *rr3) { struct ir_raw_event rawir = {}; struct device *dev; unsigned int i, sig_size, offset, val; u32 mod_freq; dev = rr3->dev; mod_freq = redrat3_val_to_mod_freq(&rr3->irdata); dev_dbg(dev, "Got mod_freq of %u\n", mod_freq); if (mod_freq && rr3->wideband) { struct ir_raw_event ev = { .carrier_report = 1, .carrier = mod_freq }; ir_raw_event_store(rr3->rc, &ev); } /* process each rr3 encoded byte into an int */ sig_size = be16_to_cpu(rr3->irdata.sig_size); for (i = 0; i < sig_size; i++) { offset = rr3->irdata.sigdata[i]; val = get_unaligned_be16(&rr3->irdata.lens[offset]); /* we should always get pulse/space/pulse/space samples */ if (i % 2) rawir.pulse = false; else rawir.pulse = true; rawir.duration = redrat3_len_to_us(val); /* cap the value to IR_MAX_DURATION */ rawir.duration = (rawir.duration > IR_MAX_DURATION) ? IR_MAX_DURATION : rawir.duration; dev_dbg(dev, "storing %s with duration %d (i: %d)\n", rawir.pulse ? "pulse" : "space", rawir.duration, i); ir_raw_event_store_with_filter(rr3->rc, &rawir); } /* add a trailing space */ rawir.pulse = false; rawir.timeout = true; rawir.duration = rr3->rc->timeout; dev_dbg(dev, "storing trailing timeout with duration %d\n", rawir.duration); ir_raw_event_store_with_filter(rr3->rc, &rawir); dev_dbg(dev, "calling ir_raw_event_handle\n"); ir_raw_event_handle(rr3->rc); } /* Util fn to send rr3 cmds */ static int redrat3_send_cmd(int cmd, struct redrat3_dev *rr3) { struct usb_device *udev; u8 *data; int res; data = kzalloc(sizeof(u8), GFP_KERNEL); if (!data) return -ENOMEM; udev = rr3->udev; res = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), cmd, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x0000, 0x0000, data, sizeof(u8), 10000); if (res < 0) { dev_err(rr3->dev, "%s: Error sending rr3 cmd res %d, data %d", __func__, res, *data); res = -EIO; } else res = data[0]; kfree(data); return res; } /* Enables the long range detector and starts async receive */ static int redrat3_enable_detector(struct redrat3_dev *rr3) { struct device *dev = rr3->dev; int ret; ret = redrat3_send_cmd(RR3_RC_DET_ENABLE, rr3); if (ret != 0) dev_dbg(dev, "%s: unexpected ret of %d\n", __func__, ret); ret = redrat3_send_cmd(RR3_RC_DET_STATUS, rr3); if (ret != 1) { dev_err(dev, "%s: detector status: %d, should be 1\n", __func__, ret); return -EIO; } ret = usb_submit_urb(rr3->narrow_urb, GFP_KERNEL); if (ret) { dev_err(rr3->dev, "narrow band urb failed: %d", ret); return ret; } ret = usb_submit_urb(rr3->wide_urb, GFP_KERNEL); if (ret) dev_err(rr3->dev, "wide band urb failed: %d", ret); return ret; } static inline void redrat3_delete(struct redrat3_dev *rr3, struct usb_device *udev) { usb_kill_urb(rr3->narrow_urb); usb_kill_urb(rr3->wide_urb); usb_kill_urb(rr3->flash_urb); usb_kill_urb(rr3->learn_urb); usb_free_urb(rr3->narrow_urb); usb_free_urb(rr3->wide_urb); usb_free_urb(rr3->flash_urb); usb_free_urb(rr3->learn_urb); usb_free_coherent(udev, le16_to_cpu(rr3->ep_narrow->wMaxPacketSize), rr3->bulk_in_buf, rr3->dma_in); kfree(rr3); } static u32 redrat3_get_timeout(struct redrat3_dev *rr3) { __be32 *tmp; u32 timeout = MS_TO_US(150); /* a sane default, if things go haywire */ int len, ret, pipe; len = sizeof(*tmp); tmp = kzalloc(len, GFP_KERNEL); if (!tmp) return timeout; pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_GET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, RR3_IR_IO_SIG_TIMEOUT, 0, tmp, len, 5000); if (ret != len) dev_warn(rr3->dev, "Failed to read timeout from hardware\n"); else { timeout = redrat3_len_to_us(be32_to_cpup(tmp)); dev_dbg(rr3->dev, "Got timeout of %d ms\n", timeout / 1000); } kfree(tmp); return timeout; } static int redrat3_set_timeout(struct rc_dev *rc_dev, unsigned int timeoutus) { struct redrat3_dev *rr3 = rc_dev->priv; struct usb_device *udev = rr3->udev; struct device *dev = rr3->dev; __be32 *timeout; int ret; timeout = kmalloc(sizeof(*timeout), GFP_KERNEL); if (!timeout) return -ENOMEM; *timeout = cpu_to_be32(redrat3_us_to_len(timeoutus)); ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_SIG_TIMEOUT, 0, timeout, sizeof(*timeout), 25000); dev_dbg(dev, "set ir parm timeout %d ret 0x%02x\n", be32_to_cpu(*timeout), ret); if (ret == sizeof(*timeout)) ret = 0; else if (ret >= 0) ret = -EIO; kfree(timeout); return ret; } static void redrat3_reset(struct redrat3_dev *rr3) { struct usb_device *udev = rr3->udev; struct device *dev = rr3->dev; int rc, rxpipe, txpipe; u8 *val; size_t const len = sizeof(*val); rxpipe = usb_rcvctrlpipe(udev, 0); txpipe = usb_sndctrlpipe(udev, 0); val = kmalloc(len, GFP_KERNEL); if (!val) return; *val = 0x01; rc = usb_control_msg(udev, rxpipe, RR3_RESET, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, RR3_CPUCS_REG_ADDR, 0, val, len, 25000); dev_dbg(dev, "reset returned 0x%02x\n", rc); *val = length_fuzz; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_LENGTH_FUZZ, 0, val, len, 25000); dev_dbg(dev, "set ir parm len fuzz %d rc 0x%02x\n", *val, rc); *val = (65536 - (minimum_pause * 2000)) / 256; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_MIN_PAUSE, 0, val, len, 25000); dev_dbg(dev, "set ir parm min pause %d rc 0x%02x\n", *val, rc); *val = periods_measure_carrier; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_PERIODS_MF, 0, val, len, 25000); dev_dbg(dev, "set ir parm periods measure carrier %d rc 0x%02x", *val, rc); *val = RR3_DRIVER_MAXLENS; rc = usb_control_msg(udev, txpipe, RR3_SET_IR_PARAM, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_OUT, RR3_IR_IO_MAX_LENGTHS, 0, val, len, 25000); dev_dbg(dev, "set ir parm max lens %d rc 0x%02x\n", *val, rc); kfree(val); } static void redrat3_get_firmware_rev(struct redrat3_dev *rr3) { int rc; char *buffer; buffer = kcalloc(RR3_FW_VERSION_LEN + 1, sizeof(*buffer), GFP_KERNEL); if (!buffer) return; rc = usb_control_msg(rr3->udev, usb_rcvctrlpipe(rr3->udev, 0), RR3_FW_VERSION, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0, 0, buffer, RR3_FW_VERSION_LEN, 5000); if (rc >= 0) dev_info(rr3->dev, "Firmware rev: %s", buffer); else dev_err(rr3->dev, "Problem fetching firmware ID\n"); kfree(buffer); } static void redrat3_read_packet_start(struct redrat3_dev *rr3, unsigned len) { struct redrat3_header *header = rr3->bulk_in_buf; unsigned pktlen, pkttype; /* grab the Length and type of transfer */ pktlen = be16_to_cpu(header->length); pkttype = be16_to_cpu(header->transfer_type); if (pktlen > sizeof(rr3->irdata)) { dev_warn(rr3->dev, "packet length %u too large\n", pktlen); return; } switch (pkttype) { case RR3_ERROR: if (len >= sizeof(struct redrat3_error)) { struct redrat3_error *error = rr3->bulk_in_buf; unsigned fw_error = be16_to_cpu(error->fw_error); redrat3_dump_fw_error(rr3, fw_error); } break; case RR3_MOD_SIGNAL_IN: memcpy(&rr3->irdata, rr3->bulk_in_buf, len); rr3->bytes_read = len; dev_dbg(rr3->dev, "bytes_read %d, pktlen %d\n", rr3->bytes_read, pktlen); break; default: dev_dbg(rr3->dev, "ignoring packet with type 0x%02x, len of %d, 0x%02x\n", pkttype, len, pktlen); break; } } static void redrat3_read_packet_continue(struct redrat3_dev *rr3, unsigned len) { void *irdata = &rr3->irdata; if (len + rr3->bytes_read > sizeof(rr3->irdata)) { dev_warn(rr3->dev, "too much data for packet\n"); rr3->bytes_read = 0; return; } memcpy(irdata + rr3->bytes_read, rr3->bulk_in_buf, len); rr3->bytes_read += len; dev_dbg(rr3->dev, "bytes_read %d, pktlen %d\n", rr3->bytes_read, be16_to_cpu(rr3->irdata.header.length)); } /* gather IR data from incoming urb, process it when we have enough */ static int redrat3_get_ir_data(struct redrat3_dev *rr3, unsigned len) { struct device *dev = rr3->dev; unsigned pkttype; int ret = 0; if (rr3->bytes_read == 0 && len >= sizeof(struct redrat3_header)) { redrat3_read_packet_start(rr3, len); } else if (rr3->bytes_read != 0) { redrat3_read_packet_continue(rr3, len); } else if (rr3->bytes_read == 0) { dev_err(dev, "error: no packet data read\n"); ret = -ENODATA; goto out; } if (rr3->bytes_read < be16_to_cpu(rr3->irdata.header.length) + sizeof(struct redrat3_header)) /* we're still accumulating data */ return 0; /* if we get here, we've got IR data to decode */ pkttype = be16_to_cpu(rr3->irdata.header.transfer_type); if (pkttype == RR3_MOD_SIGNAL_IN) redrat3_process_ir_data(rr3); else dev_dbg(dev, "discarding non-signal data packet (type 0x%02x)\n", pkttype); out: rr3->bytes_read = 0; return ret; } /* callback function from USB when async USB request has completed */ static void redrat3_handle_async(struct urb *urb) { struct redrat3_dev *rr3 = urb->context; int ret; switch (urb->status) { case 0: ret = redrat3_get_ir_data(rr3, urb->actual_length); if (!ret && rr3->wideband && !rr3->learn_urb->hcpriv) { ret = usb_submit_urb(rr3->learn_urb, GFP_ATOMIC); if (ret) dev_err(rr3->dev, "Failed to submit learning urb: %d", ret); } if (!ret) { /* no error, prepare to read more */ ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret) dev_err(rr3->dev, "Failed to resubmit urb: %d", ret); } break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: usb_unlink_urb(urb); return; case -EPIPE: default: dev_warn(rr3->dev, "Error: urb status = %d\n", urb->status); rr3->bytes_read = 0; break; } } static u16 mod_freq_to_val(unsigned int mod_freq) { int mult = 6000000; /* Clk used in mod. freq. generation is CLK24/4. */ return 65536 - (mult / mod_freq); } static int redrat3_set_tx_carrier(struct rc_dev *rcdev, u32 carrier) { struct redrat3_dev *rr3 = rcdev->priv; struct device *dev = rr3->dev; dev_dbg(dev, "Setting modulation frequency to %u", carrier); if (carrier == 0) return -EINVAL; rr3->carrier = carrier; return 0; } static int redrat3_transmit_ir(struct rc_dev *rcdev, unsigned *txbuf, unsigned count) { struct redrat3_dev *rr3 = rcdev->priv; struct device *dev = rr3->dev; struct redrat3_irdata *irdata = NULL; int ret, ret_len; int lencheck, cur_sample_len, pipe; int *sample_lens = NULL; u8 curlencheck = 0; unsigned i, sendbuf_len; if (rr3->transmitting) { dev_warn(dev, "%s: transmitter already in use\n", __func__); return -EAGAIN; } if (count > RR3_MAX_SIG_SIZE - RR3_TX_TRAILER_LEN) return -EINVAL; /* rr3 will disable rc detector on transmit */ rr3->transmitting = true; sample_lens = kcalloc(RR3_DRIVER_MAXLENS, sizeof(*sample_lens), GFP_KERNEL); if (!sample_lens) return -ENOMEM; irdata = kzalloc(sizeof(*irdata), GFP_KERNEL); if (!irdata) { ret = -ENOMEM; goto out; } for (i = 0; i < count; i++) { cur_sample_len = redrat3_us_to_len(txbuf[i]); if (cur_sample_len > 0xffff) { dev_warn(dev, "transmit period of %uus truncated to %uus\n", txbuf[i], redrat3_len_to_us(0xffff)); cur_sample_len = 0xffff; } for (lencheck = 0; lencheck < curlencheck; lencheck++) { if (sample_lens[lencheck] == cur_sample_len) break; } if (lencheck == curlencheck) { dev_dbg(dev, "txbuf[%d]=%u, pos %d, enc %u\n", i, txbuf[i], curlencheck, cur_sample_len); if (curlencheck < RR3_DRIVER_MAXLENS) { /* now convert the value to a proper * rr3 value.. */ sample_lens[curlencheck] = cur_sample_len; put_unaligned_be16(cur_sample_len, &irdata->lens[curlencheck]); curlencheck++; } else { ret = -EINVAL; goto out; } } irdata->sigdata[i] = lencheck; } irdata->sigdata[count] = RR3_END_OF_SIGNAL; irdata->sigdata[count + 1] = RR3_END_OF_SIGNAL; sendbuf_len = offsetof(struct redrat3_irdata, sigdata[count + RR3_TX_TRAILER_LEN]); /* fill in our packet header */ irdata->header.length = cpu_to_be16(sendbuf_len - sizeof(struct redrat3_header)); irdata->header.transfer_type = cpu_to_be16(RR3_MOD_SIGNAL_OUT); irdata->pause = cpu_to_be32(redrat3_len_to_us(100)); irdata->mod_freq_count = cpu_to_be16(mod_freq_to_val(rr3->carrier)); irdata->no_lengths = curlencheck; irdata->sig_size = cpu_to_be16(count + RR3_TX_TRAILER_LEN); pipe = usb_sndbulkpipe(rr3->udev, rr3->ep_out->bEndpointAddress); ret = usb_bulk_msg(rr3->udev, pipe, irdata, sendbuf_len, &ret_len, 10000); dev_dbg(dev, "sent %d bytes, (ret %d)\n", ret_len, ret); /* now tell the hardware to transmit what we sent it */ pipe = usb_rcvctrlpipe(rr3->udev, 0); ret = usb_control_msg(rr3->udev, pipe, RR3_TX_SEND_SIGNAL, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0, 0, irdata, 2, 10000); if (ret < 0) dev_err(dev, "Error: control msg send failed, rc %d\n", ret); else ret = count; out: kfree(irdata); kfree(sample_lens); rr3->transmitting = false; /* rr3 re-enables rc detector because it was enabled before */ return ret; } static void redrat3_brightness_set(struct led_classdev *led_dev, enum led_brightness brightness) { struct redrat3_dev *rr3 = container_of(led_dev, struct redrat3_dev, led); if (brightness != LED_OFF && atomic_cmpxchg(&rr3->flash, 0, 1) == 0) { int ret = usb_submit_urb(rr3->flash_urb, GFP_ATOMIC); if (ret != 0) { dev_dbg(rr3->dev, "%s: unexpected ret of %d\n", __func__, ret); atomic_set(&rr3->flash, 0); } } } static int redrat3_wideband_receiver(struct rc_dev *rcdev, int enable) { struct redrat3_dev *rr3 = rcdev->priv; int ret = 0; rr3->wideband = enable != 0; if (enable) { ret = usb_submit_urb(rr3->learn_urb, GFP_KERNEL); if (ret) dev_err(rr3->dev, "Failed to submit learning urb: %d", ret); } return ret; } static void redrat3_learn_complete(struct urb *urb) { struct redrat3_dev *rr3 = urb->context; switch (urb->status) { case 0: break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: usb_unlink_urb(urb); return; case -EPIPE: default: dev_err(rr3->dev, "Error: learn urb status = %d", urb->status); break; } } static void redrat3_led_complete(struct urb *urb) { struct redrat3_dev *rr3 = urb->context; switch (urb->status) { case 0: break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: usb_unlink_urb(urb); return; case -EPIPE: default: dev_dbg(rr3->dev, "Error: urb status = %d\n", urb->status); break; } rr3->led.brightness = LED_OFF; atomic_dec(&rr3->flash); } static struct rc_dev *redrat3_init_rc_dev(struct redrat3_dev *rr3) { struct device *dev = rr3->dev; struct rc_dev *rc; int ret; u16 prod = le16_to_cpu(rr3->udev->descriptor.idProduct); rc = rc_allocate_device(RC_DRIVER_IR_RAW); if (!rc) return NULL; snprintf(rr3->name, sizeof(rr3->name), "RedRat3%s Infrared Remote Transceiver", prod == USB_RR3IIUSB_PRODUCT_ID ? "-II" : ""); usb_make_path(rr3->udev, rr3->phys, sizeof(rr3->phys)); rc->device_name = rr3->name; rc->input_phys = rr3->phys; usb_to_input_id(rr3->udev, &rc->input_id); rc->dev.parent = dev; rc->priv = rr3; rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->min_timeout = MS_TO_US(RR3_RX_MIN_TIMEOUT); rc->max_timeout = MS_TO_US(RR3_RX_MAX_TIMEOUT); rc->timeout = redrat3_get_timeout(rr3); rc->s_timeout = redrat3_set_timeout; rc->tx_ir = redrat3_transmit_ir; rc->s_tx_carrier = redrat3_set_tx_carrier; rc->s_carrier_report = redrat3_wideband_receiver; rc->driver_name = DRIVER_NAME; rc->rx_resolution = 2; rc->map_name = RC_MAP_HAUPPAUGE; ret = rc_register_device(rc); if (ret < 0) { dev_err(dev, "remote dev registration failed\n"); goto out; } return rc; out: rc_free_device(rc); return NULL; } static int redrat3_dev_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct device *dev = &intf->dev; struct usb_host_interface *uhi; struct redrat3_dev *rr3; struct usb_endpoint_descriptor *ep; struct usb_endpoint_descriptor *ep_narrow = NULL; struct usb_endpoint_descriptor *ep_wide = NULL; struct usb_endpoint_descriptor *ep_out = NULL; u8 addr, attrs; int pipe, i; int retval = -ENOMEM; uhi = intf->cur_altsetting; /* find our bulk-in and bulk-out endpoints */ for (i = 0; i < uhi->desc.bNumEndpoints; ++i) { ep = &uhi->endpoint[i].desc; addr = ep->bEndpointAddress; attrs = ep->bmAttributes; if (((addr & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN) && ((attrs & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_BULK)) { dev_dbg(dev, "found bulk-in endpoint at 0x%02x\n", ep->bEndpointAddress); /* data comes in on 0x82, 0x81 is for learning */ if (ep->bEndpointAddress == RR3_NARROW_IN_EP_ADDR) ep_narrow = ep; if (ep->bEndpointAddress == RR3_WIDE_IN_EP_ADDR) ep_wide = ep; } if ((ep_out == NULL) && ((addr & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT) && ((attrs & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_BULK)) { dev_dbg(dev, "found bulk-out endpoint at 0x%02x\n", ep->bEndpointAddress); ep_out = ep; } } if (!ep_narrow || !ep_out || !ep_wide) { dev_err(dev, "Couldn't find all endpoints\n"); retval = -ENODEV; goto no_endpoints; } /* allocate memory for our device state and initialize it */ rr3 = kzalloc(sizeof(*rr3), GFP_KERNEL); if (!rr3) goto no_endpoints; rr3->dev = &intf->dev; rr3->ep_narrow = ep_narrow; rr3->ep_out = ep_out; rr3->udev = udev; /* set up bulk-in endpoint */ rr3->narrow_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->narrow_urb) goto redrat_free; rr3->wide_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->wide_urb) goto redrat_free; rr3->bulk_in_buf = usb_alloc_coherent(udev, le16_to_cpu(ep_narrow->wMaxPacketSize), GFP_KERNEL, &rr3->dma_in); if (!rr3->bulk_in_buf) goto redrat_free; pipe = usb_rcvbulkpipe(udev, ep_narrow->bEndpointAddress); usb_fill_bulk_urb(rr3->narrow_urb, udev, pipe, rr3->bulk_in_buf, le16_to_cpu(ep_narrow->wMaxPacketSize), redrat3_handle_async, rr3); rr3->narrow_urb->transfer_dma = rr3->dma_in; rr3->narrow_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; pipe = usb_rcvbulkpipe(udev, ep_wide->bEndpointAddress); usb_fill_bulk_urb(rr3->wide_urb, udev, pipe, rr3->bulk_in_buf, le16_to_cpu(ep_narrow->wMaxPacketSize), redrat3_handle_async, rr3); rr3->wide_urb->transfer_dma = rr3->dma_in; rr3->wide_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; redrat3_reset(rr3); redrat3_get_firmware_rev(rr3); /* default.. will get overridden by any sends with a freq defined */ rr3->carrier = 38000; atomic_set(&rr3->flash, 0); rr3->flash_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->flash_urb) goto redrat_free; /* learn urb */ rr3->learn_urb = usb_alloc_urb(0, GFP_KERNEL); if (!rr3->learn_urb) goto redrat_free; /* setup packet is 'c0 b2 0000 0000 0001' */ rr3->learn_control.bRequestType = 0xc0; rr3->learn_control.bRequest = RR3_MODSIG_CAPTURE; rr3->learn_control.wLength = cpu_to_le16(1); usb_fill_control_urb(rr3->learn_urb, udev, usb_rcvctrlpipe(udev, 0), (unsigned char *)&rr3->learn_control, &rr3->learn_buf, sizeof(rr3->learn_buf), redrat3_learn_complete, rr3); /* setup packet is 'c0 b9 0000 0000 0001' */ rr3->flash_control.bRequestType = 0xc0; rr3->flash_control.bRequest = RR3_BLINK_LED; rr3->flash_control.wLength = cpu_to_le16(1); usb_fill_control_urb(rr3->flash_urb, udev, usb_rcvctrlpipe(udev, 0), (unsigned char *)&rr3->flash_control, &rr3->flash_in_buf, sizeof(rr3->flash_in_buf), redrat3_led_complete, rr3); /* led control */ rr3->led.name = "redrat3:red:feedback"; rr3->led.default_trigger = "rc-feedback"; rr3->led.brightness_set = redrat3_brightness_set; retval = led_classdev_register(&intf->dev, &rr3->led); if (retval) goto redrat_free; rr3->rc = redrat3_init_rc_dev(rr3); if (!rr3->rc) { retval = -ENOMEM; goto led_free; } /* might be all we need to do? */ retval = redrat3_enable_detector(rr3); if (retval < 0) goto led_free; /* we can register the device now, as it is ready */ usb_set_intfdata(intf, rr3); return 0; led_free: led_classdev_unregister(&rr3->led); redrat_free: redrat3_delete(rr3, rr3->udev); no_endpoints: return retval; } static void redrat3_dev_disconnect(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct redrat3_dev *rr3 = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); rc_unregister_device(rr3->rc); led_classdev_unregister(&rr3->led); redrat3_delete(rr3, udev); } static int redrat3_dev_suspend(struct usb_interface *intf, pm_message_t message) { struct redrat3_dev *rr3 = usb_get_intfdata(intf); led_classdev_suspend(&rr3->led); usb_kill_urb(rr3->narrow_urb); usb_kill_urb(rr3->wide_urb); usb_kill_urb(rr3->flash_urb); return 0; } static int redrat3_dev_resume(struct usb_interface *intf) { struct redrat3_dev *rr3 = usb_get_intfdata(intf); if (usb_submit_urb(rr3->narrow_urb, GFP_NOIO)) return -EIO; if (usb_submit_urb(rr3->wide_urb, GFP_NOIO)) return -EIO; led_classdev_resume(&rr3->led); return 0; } static struct usb_driver redrat3_dev_driver = { .name = DRIVER_NAME, .probe = redrat3_dev_probe, .disconnect = redrat3_dev_disconnect, .suspend = redrat3_dev_suspend, .resume = redrat3_dev_resume, .reset_resume = redrat3_dev_resume, .id_table = redrat3_dev_table }; module_usb_driver(redrat3_dev_driver); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_AUTHOR(DRIVER_AUTHOR2); MODULE_LICENSE("GPL"); MODULE_DEVICE_TABLE(usb, redrat3_dev_table);
6 6 5 5 5 4 4 4 4 4 4 4 1 6 4 4 3 2 2 2 2 2 2 2 4 2 2 1 1 1 2 6 6 5 3 3 2 3 1 2 2 2 2 1 5 4 4 3 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 3 3 2 1 1 1 1 1 1 1 1 1 1 3 4 4 3 2 1 1 1 1 1 3 2 2 2 2 2 1 1 1 1 16 16 16 1 1 18 18 17 6 5 11 16 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/nfc.h> #include <linux/sched/signal.h> #include "nfc.h" #include "llcp.h" static int sock_wait_state(struct sock *sk, int state, unsigned long timeo) { DECLARE_WAITQUEUE(wait, current); int err = 0; pr_debug("sk %p", sk); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (sk->sk_state != state) { if (!timeo) { err = -EINPROGRESS; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } static struct proto llcp_sock_proto = { .name = "NFC_LLCP", .owner = THIS_MODULE, .obj_size = sizeof(struct nfc_llcp_sock), }; static int llcp_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); /* This is going to be a listening socket, dsap must be 0 */ if (llcp_addr.dsap != 0) return -EINVAL; lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, llcp_addr.service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(llcp_addr.service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_put_local; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -EADDRINUSE; goto free_service_name; } llcp_sock->reserved_ssap = llcp_sock->ssap; nfc_llcp_sock_link(&local->sockets, sk); pr_debug("Socket bound to SAP %d\n", llcp_sock->ssap); sk->sk_state = LLCP_BOUND; nfc_put_device(dev); release_sock(sk); return 0; free_service_name: kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_put_local: nfc_llcp_local_put(llcp_sock->local); llcp_sock->local = NULL; llcp_sock->dev = NULL; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; nfc_llcp_sock_link(&local->raw_sockets, sk); sk->sk_state = LLCP_BOUND; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int ret = 0; pr_debug("sk %p backlog %d\n", sk, backlog); lock_sock(sk); if ((sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM) || sk->sk_state != LLCP_BOUND) { ret = -EBADFD; goto error; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; pr_debug("Socket listening\n"); sk->sk_state = LLCP_LISTEN; error: release_sock(sk); return ret; } static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); u32 opt; int err = 0; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case NFC_LLCP_RW: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_RW) { err = -EINVAL; break; } llcp_sock->rw = (u8) opt; break; case NFC_LLCP_MIUX: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_MIUX) { err = -EINVAL; break; } llcp_sock->miux = cpu_to_be16((u16) opt); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); pr_debug("%p rw %d miux %d\n", llcp_sock, llcp_sock->rw, llcp_sock->miux); return err; } static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct nfc_llcp_local *local; struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int len, err = 0; u16 miux, remote_miu; u8 rw; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; local = llcp_sock->local; if (!local) return -ENODEV; len = min_t(u32, len, sizeof(u32)); lock_sock(sk); switch (optname) { case NFC_LLCP_RW: rw = llcp_sock->rw > LLCP_MAX_RW ? local->rw : llcp_sock->rw; if (put_user(rw, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_MIUX: miux = be16_to_cpu(llcp_sock->miux) > LLCP_MAX_MIUX ? be16_to_cpu(local->miux) : be16_to_cpu(llcp_sock->miux); if (put_user(miux, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_MIU: remote_miu = llcp_sock->remote_miu > LLCP_MAX_MIU ? local->remote_miu : llcp_sock->remote_miu; if (put_user(remote_miu, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_LTO: if (put_user(local->remote_lto / 10, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_RW: if (put_user(llcp_sock->remote_rw, (u32 __user *) optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); if (put_user(len, optlen)) return -EFAULT; return err; } void nfc_llcp_accept_unlink(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("state %d\n", sk->sk_state); list_del_init(&llcp_sock->accept_queue); sk_acceptq_removed(llcp_sock->parent); llcp_sock->parent = NULL; sock_put(sk); } void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_sock *llcp_sock_parent = nfc_llcp_sock(parent); /* Lock will be free from unlink */ sock_hold(sk); list_add_tail(&llcp_sock->accept_queue, &llcp_sock_parent->accept_queue); llcp_sock->parent = parent; sk_acceptq_added(parent); } struct sock *nfc_llcp_accept_dequeue(struct sock *parent, struct socket *newsock) { struct nfc_llcp_sock *lsk, *n, *llcp_parent; struct sock *sk; llcp_parent = nfc_llcp_sock(parent); list_for_each_entry_safe(lsk, n, &llcp_parent->accept_queue, accept_queue) { sk = &lsk->sk; lock_sock(sk); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_accept_unlink(sk); continue; } if (sk->sk_state == LLCP_CONNECTED || !newsock) { list_del_init(&lsk->accept_queue); sock_put(sk); if (newsock) sock_graft(sk, newsock); release_sock(sk); pr_debug("Returning sk state %d\n", sk->sk_state); sk_acceptq_removed(parent); return sk; } release_sock(sk); } return NULL; } static int llcp_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *new_sk; long timeo; int ret = 0; pr_debug("parent %p\n", sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_state != LLCP_LISTEN) { ret = -EBADFD; goto error; } timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection. */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(new_sk = nfc_llcp_accept_dequeue(sk, newsock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); if (ret) goto error; newsock->state = SS_CONNECTED; pr_debug("new socket %p\n", new_sk); error: release_sock(sk); return ret; } static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, llcp_addr, uaddr); if (llcp_sock == NULL || llcp_sock->dev == NULL) return -EBADFD; pr_debug("%p %d %d %d\n", sk, llcp_sock->target_idx, llcp_sock->dsap, llcp_sock->ssap); memset(llcp_addr, 0, sizeof(*llcp_addr)); lock_sock(sk); if (!llcp_sock->dev) { release_sock(sk); return -EBADFD; } llcp_addr->sa_family = AF_NFC; llcp_addr->dev_idx = llcp_sock->dev->idx; llcp_addr->target_idx = llcp_sock->target_idx; llcp_addr->nfc_protocol = llcp_sock->nfc_protocol; llcp_addr->dsap = llcp_sock->dsap; llcp_addr->ssap = llcp_sock->ssap; llcp_addr->service_name_len = llcp_sock->service_name_len; memcpy(llcp_addr->service_name, llcp_sock->service_name, llcp_addr->service_name_len); release_sock(sk); return sizeof(struct sockaddr_nfc_llcp); } static inline __poll_t llcp_accept_poll(struct sock *parent) { struct nfc_llcp_sock *llcp_sock, *parent_sock; struct sock *sk; parent_sock = nfc_llcp_sock(parent); list_for_each_entry(llcp_sock, &parent_sock->accept_queue, accept_queue) { sk = &llcp_sock->sk; if (sk->sk_state == LLCP_CONNECTED) return EPOLLIN | EPOLLRDNORM; } return 0; } static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); sock_poll_wait(file, sock, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == LLCP_CLOSED) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); return mask; } static int llcp_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct nfc_llcp_local *local; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int err = 0; if (!sk) return 0; pr_debug("%p\n", sk); local = llcp_sock->local; if (local == NULL) { err = -ENODEV; goto out; } lock_sock(sk); /* Send a DISC */ if (sk->sk_state == LLCP_CONNECTED) nfc_llcp_send_disconnect(llcp_sock); if (sk->sk_state == LLCP_LISTEN) { struct nfc_llcp_sock *lsk, *n; struct sock *accept_sk; list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue, accept_queue) { accept_sk = &lsk->sk; lock_sock(accept_sk); nfc_llcp_send_disconnect(lsk); nfc_llcp_accept_unlink(accept_sk); release_sock(accept_sk); } } if (sock->type == SOCK_RAW) nfc_llcp_sock_unlink(&local->raw_sockets, sk); else nfc_llcp_sock_unlink(&local->sockets, sk); if (llcp_sock->reserved_ssap < LLCP_SAP_MAX) nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap); release_sock(sk); out: sock_orphan(sk); sock_put(sk); return err; } static int llcp_sock_connect(struct socket *sock, struct sockaddr_unsized *_addr, int len, int flags) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct sockaddr_nfc_llcp *addr = (struct sockaddr_nfc_llcp *)_addr; struct nfc_dev *dev; struct nfc_llcp_local *local; int ret = 0; pr_debug("sock %p sk %p flags 0x%x\n", sock, sk, flags); if (!addr || len < sizeof(*addr) || addr->sa_family != AF_NFC) return -EINVAL; if (addr->service_name_len == 0 && addr->dsap == 0) return -EINVAL; pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx, addr->target_idx, addr->nfc_protocol); lock_sock(sk); if (sk->sk_state == LLCP_CONNECTED) { ret = -EISCONN; goto error; } if (sk->sk_state == LLCP_CONNECTING) { ret = -EINPROGRESS; goto error; } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } device_lock(&dev->dev); if (dev->dep_link_up == false) { ret = -ENOLINK; device_unlock(&dev->dev); goto sock_llcp_put_local; } device_unlock(&dev->dev); if (local->rf_mode == NFC_RF_INITIATOR && addr->target_idx != local->target_idx) { ret = -ENOLINK; goto sock_llcp_put_local; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -ENOMEM; goto sock_llcp_nullify; } llcp_sock->reserved_ssap = llcp_sock->ssap; if (addr->service_name_len == 0) llcp_sock->dsap = addr->dsap; else llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->nfc_protocol = addr->nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, addr->service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(addr->service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_release; } nfc_llcp_sock_link(&local->connecting_sockets, sk); ret = nfc_llcp_send_connect(llcp_sock); if (ret) goto sock_unlink; sk->sk_state = LLCP_CONNECTING; ret = sock_wait_state(sk, LLCP_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); if (ret && ret != -EINPROGRESS) goto sock_unlink; release_sock(sk); return ret; sock_unlink: nfc_llcp_sock_unlink(&local->connecting_sockets, sk); kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); sock_llcp_nullify: llcp_sock->local = NULL; llcp_sock->dev = NULL; sock_llcp_put_local: nfc_llcp_local_put(local); put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int ret; pr_debug("sock %p sk %p", sock, sk); ret = sock_error(sk); if (ret) return ret; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); if (!llcp_sock->local) { release_sock(sk); return -ENODEV; } if (sk->sk_type == SOCK_DGRAM) { if (sk->sk_state != LLCP_BOUND) { release_sock(sk); return -ENOTCONN; } DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } release_sock(sk); return nfc_llcp_send_ui_frame(llcp_sock, addr->dsap, addr->ssap, msg, len); } if (sk->sk_state != LLCP_CONNECTED) { release_sock(sk); return -ENOTCONN; } release_sock(sk); return nfc_llcp_send_i_frame(llcp_sock, msg, len); } static int llcp_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; unsigned int copied, rlen; struct sk_buff *skb, *cskb; int err = 0; pr_debug("%p %zu\n", sk, len); lock_sock(sk); if (sk->sk_state == LLCP_CLOSED && skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); return 0; } release_sock(sk); if (flags & (MSG_OOB)) return -EOPNOTSUPP; skb = skb_recv_datagram(sk, flags, &err); if (!skb) { pr_err("Recv datagram failed state %d %d %d", sk->sk_state, err, sock_error(sk)); if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; return err; } rlen = skb->len; /* real length of skb */ copied = min_t(unsigned int, rlen, len); cskb = skb; if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; } sock_recv_timestamp(msg, sk, skb); if (sk->sk_type == SOCK_DGRAM && msg->msg_name) { struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, sockaddr, msg->msg_name); msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp); pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap); memset(sockaddr, 0, sizeof(*sockaddr)); sockaddr->sa_family = AF_NFC; sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP; sockaddr->dsap = ui_cb->dsap; sockaddr->ssap = ui_cb->ssap; } /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { /* SOCK_STREAM: re-queue skb if it contains unreceived data */ if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { skb_pull(skb, copied); if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); goto done; } } kfree_skb(skb); } /* XXX Queue backlogged skbs */ done: /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */ if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC)) copied = rlen; return copied; } static const struct proto_ops llcp_sock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_sock_bind, .connect = llcp_sock_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, .setsockopt = nfc_llcp_setsockopt, .getsockopt = nfc_llcp_getsockopt, .sendmsg = llcp_sock_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops llcp_rawsock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_raw_sock_bind, .connect = sock_no_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static void llcp_sock_destruct(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("%p\n", sk); if (sk->sk_state == LLCP_CONNECTED) nfc_put_device(llcp_sock->dev); skb_queue_purge(&sk->sk_receive_queue); nfc_llcp_sock_free(llcp_sock); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Freeing alive NFC LLCP socket %p\n", sk); return; } } struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; llcp_sock = nfc_llcp_sock(sk); sock_init_data(sock, sk); sk->sk_state = LLCP_CLOSED; sk->sk_protocol = NFC_SOCKPROTO_LLCP; sk->sk_type = type; sk->sk_destruct = llcp_sock_destruct; llcp_sock->ssap = 0; llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->rw = LLCP_MAX_RW + 1; llcp_sock->miux = cpu_to_be16(LLCP_MAX_MIUX + 1); llcp_sock->send_n = llcp_sock->send_ack_n = 0; llcp_sock->recv_n = llcp_sock->recv_ack_n = 0; llcp_sock->remote_ready = 1; llcp_sock->reserved_ssap = LLCP_SAP_MAX; nfc_llcp_socket_remote_param_init(llcp_sock); skb_queue_head_init(&llcp_sock->tx_queue); skb_queue_head_init(&llcp_sock->tx_pending_queue); INIT_LIST_HEAD(&llcp_sock->accept_queue); if (sock != NULL) sock->state = SS_UNCONNECTED; return sk; } void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) { kfree(sock->service_name); skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); list_del_init(&sock->accept_queue); sock->parent = NULL; nfc_llcp_local_put(sock->local); } static int llcp_sock_create(struct net *net, struct socket *sock, const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; pr_debug("%p\n", sock); if (sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (sock->type == SOCK_RAW) { if (!capable(CAP_NET_RAW)) return -EPERM; sock->ops = &llcp_rawsock_ops; } else { sock->ops = &llcp_sock_ops; } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; return 0; } static const struct nfc_protocol llcp_nfc_proto = { .id = NFC_SOCKPROTO_LLCP, .proto = &llcp_sock_proto, .owner = THIS_MODULE, .create = llcp_sock_create }; int __init nfc_llcp_sock_init(void) { return nfc_proto_register(&llcp_nfc_proto); } void nfc_llcp_sock_exit(void) { nfc_proto_unregister(&llcp_nfc_proto); }
1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 // SPDX-License-Identifier: GPL-2.0-or-later /****************************************************************************** * vlanproc.c VLAN Module. /proc filesystem interface. * * This module is completely hardware-independent and provides * access to the router using Linux /proc filesystem. * * Author: Ben Greear, <greearb@candelatech.com> coppied from wanproc.c * by: Gene Kozin <genek@compuserve.com> * * Copyright: (c) 1998 Ben Greear * * ============================================================================ * Jan 20, 1998 Ben Greear Initial Version *****************************************************************************/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "vlanproc.h" #include "vlan.h" /****** Function Prototypes *************************************************/ /* Methods for preparing data for reading proc entries */ static int vlan_seq_show(struct seq_file *seq, void *v); static void *vlan_seq_start(struct seq_file *seq, loff_t *pos); static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos); static void vlan_seq_stop(struct seq_file *seq, void *); static int vlandev_seq_show(struct seq_file *seq, void *v); /* * Global Data */ /* * Names of the proc directory entries */ static const char name_root[] = "vlan"; static const char name_conf[] = "config"; /* * Structures for interfacing with the /proc filesystem. * VLAN creates its own directory /proc/net/vlan with the following * entries: * config device status/configuration * <device> entry for each device */ /* * Generic /proc/net/vlan/<file> file and inode operations */ static const struct seq_operations vlan_seq_ops = { .start = vlan_seq_start, .next = vlan_seq_next, .stop = vlan_seq_stop, .show = vlan_seq_show, }; /* * Proc filesystem directory entries. */ /* Strings */ static const char *const vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", }; /* * Interface functions */ /* * Clean up /proc/net/vlan entries */ void vlan_proc_cleanup(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); if (vn->proc_vlan_conf) remove_proc_entry(name_conf, vn->proc_vlan_dir); if (vn->proc_vlan_dir) remove_proc_entry(name_root, net->proc_net); /* Dynamically added entries should be cleaned up as their vlan_device * is removed, so we should not have to take care of it here... */ } /* * Create /proc/net/vlan entries */ int __net_init vlan_proc_init(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net); if (!vn->proc_vlan_dir) goto err; vn->proc_vlan_conf = proc_create_net(name_conf, S_IFREG | 0600, vn->proc_vlan_dir, &vlan_seq_ops, sizeof(struct seq_net_private)); if (!vn->proc_vlan_conf) goto err; return 0; err: pr_err("can't create entry in proc filesystem!\n"); vlan_proc_cleanup(net); return -ENOBUFS; } /* * Add directory entry for VLAN device. */ int vlan_proc_add_dev(struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id); if (!strcmp(vlandev->name, name_conf)) return -EINVAL; vlan->dent = proc_create_single_data(vlandev->name, S_IFREG | 0600, vn->proc_vlan_dir, vlandev_seq_show, vlandev); if (!vlan->dent) return -ENOBUFS; return 0; } /* * Delete directory entry for VLAN device. */ void vlan_proc_rem_dev(struct net_device *vlandev) { /** NOTE: This will consume the memory pointed to by dent, it seems. */ proc_remove(vlan_dev_priv(vlandev)->dent); vlan_dev_priv(vlandev)->dent = NULL; } /****** Proc filesystem entry points ****************************************/ /* * The following few functions build the content of /proc/net/vlan/config */ static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos) { unsigned long ifindex = *pos; struct net_device *dev; for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { if (!is_vlan_dev(dev)) continue; *pos = dev->ifindex; return dev; } return NULL; } static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; return vlan_seq_from_index(seq, pos); } static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return vlan_seq_from_index(seq, pos); } static void vlan_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } static int vlan_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); struct vlan_net *vn = net_generic(net, vlan_net_id); if (v == SEQ_START_TOKEN) { const char *nmtype = NULL; seq_puts(seq, "VLAN Dev name | VLAN ID\n"); if (vn->name_type < ARRAY_SIZE(vlan_name_type_str)) nmtype = vlan_name_type_str[vn->name_type]; seq_printf(seq, "Name-Type: %s\n", nmtype ? nmtype : "UNKNOWN"); } else { const struct net_device *vlandev = v; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); seq_printf(seq, "%-15s| %d | %s\n", vlandev->name, vlan->vlan_id, vlan->real_dev->name); } return 0; } static int vlandev_seq_show(struct seq_file *seq, void *offset) { struct net_device *vlandev = (struct net_device *) seq->private; const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats; static const char fmt64[] = "%30s %12llu\n"; int i; if (!is_vlan_dev(vlandev)) return 0; stats = dev_get_stats(vlandev, &temp); seq_printf(seq, "%s VID: %d REORDER_HDR: %i dev->priv_flags: %x\n", vlandev->name, vlan->vlan_id, (int)(vlan->flags & 1), (u32)vlandev->priv_flags); seq_printf(seq, fmt64, "total frames received", stats->rx_packets); seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes); seq_printf(seq, fmt64, "Broadcast/Multicast Rcvd", stats->multicast); seq_puts(seq, "\n"); seq_printf(seq, fmt64, "total frames transmitted", stats->tx_packets); seq_printf(seq, fmt64, "total bytes transmitted", stats->tx_bytes); seq_printf(seq, "Device: %s", vlan->real_dev->name); /* now show all PRIORITY mappings relating to this VLAN */ seq_printf(seq, "\nINGRESS priority mappings: " "0:%u 1:%u 2:%u 3:%u 4:%u 5:%u 6:%u 7:%u\n", vlan->ingress_priority_map[0], vlan->ingress_priority_map[1], vlan->ingress_priority_map[2], vlan->ingress_priority_map[3], vlan->ingress_priority_map[4], vlan->ingress_priority_map[5], vlan->ingress_priority_map[6], vlan->ingress_priority_map[7]); seq_printf(seq, " EGRESS priority mappings: "); for (i = 0; i < 16; i++) { const struct vlan_priority_tci_mapping *mp = vlan->egress_priority_map[i]; while (mp) { seq_printf(seq, "%u:%d ", mp->priority, ((mp->vlan_qos >> 13) & 0x7)); mp = mp->next; } } seq_puts(seq, "\n"); return 0; }
138 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Checksumming functions for IPv6 * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Borrows very liberally from tcp.c and ip.c, see those * files for more names. */ /* * Fixes: * * Ralf Baechle : generic ipv6 checksum * <ralf@waldorf-gmbh.de> */ #ifndef _CHECKSUM_IPV6_H #define _CHECKSUM_IPV6_H #include <asm/types.h> #include <asm/byteorder.h> #include <net/ip.h> #include <asm/checksum.h> #include <linux/in6.h> #include <linux/tcp.h> #include <linux/ipv6.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum); #endif static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto) { return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, proto, 0)); } static __inline__ __sum16 tcp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, __wsum base) { return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline void __tcp_v6_send_check(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct tcphdr *th = tcp_hdr(skb); th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct tcphdr *th = tcp_hdr(skb); ipv6h->payload_len = 0; th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0); } static inline __sum16 udp_v6_check(int len, const struct in6_addr *saddr, const struct in6_addr *daddr, __wsum base) { return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base); } void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len); int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto); #endif
5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CEPH_DECODE_H #define __CEPH_DECODE_H #include <linux/err.h> #include <linux/bug.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/unaligned.h> #include <linux/ceph/types.h> /* * in all cases, * void **p pointer to position pointer * void *end pointer to end of buffer (last byte + 1) */ static inline u64 ceph_decode_64(void **p) { u64 v = get_unaligned_le64(*p); *p += sizeof(u64); return v; } static inline u32 ceph_decode_32(void **p) { u32 v = get_unaligned_le32(*p); *p += sizeof(u32); return v; } static inline u16 ceph_decode_16(void **p) { u16 v = get_unaligned_le16(*p); *p += sizeof(u16); return v; } static inline u8 ceph_decode_8(void **p) { u8 v = *(u8 *)*p; (*p)++; return v; } static inline void ceph_decode_copy(void **p, void *pv, size_t n) { memcpy(pv, *p, n); *p += n; } /* * bounds check input. */ static inline bool ceph_has_room(void **p, void *end, size_t n) { return end >= *p && n <= end - *p; } #define ceph_decode_need(p, end, n, bad) \ do { \ if (!likely(ceph_has_room(p, end, n))) \ goto bad; \ } while (0) #define ceph_decode_64_safe(p, end, v, bad) \ do { \ ceph_decode_need(p, end, sizeof(u64), bad); \ v = ceph_decode_64(p); \ } while (0) #define ceph_decode_32_safe(p, end, v, bad) \ do { \ ceph_decode_need(p, end, sizeof(u32), bad); \ v = ceph_decode_32(p); \ } while (0) #define ceph_decode_16_safe(p, end, v, bad) \ do { \ ceph_decode_need(p, end, sizeof(u16), bad); \ v = ceph_decode_16(p); \ } while (0) #define ceph_decode_8_safe(p, end, v, bad) \ do { \ ceph_decode_need(p, end, sizeof(u8), bad); \ v = ceph_decode_8(p); \ } while (0) #define ceph_decode_copy_safe(p, end, pv, n, bad) \ do { \ ceph_decode_need(p, end, n, bad); \ ceph_decode_copy(p, pv, n); \ } while (0) /* * Allocate a buffer big enough to hold the wire-encoded string, and * decode the string into it. The resulting string will always be * terminated with '\0'. If successful, *p will be advanced * past the decoded data. Also, if lenp is not a null pointer, the * length (not including the terminating '\0') will be recorded in * *lenp. Note that a zero-length string is a valid return value. * * Returns a pointer to the newly-allocated string buffer, or a * pointer-coded errno if an error occurs. Neither *p nor *lenp * will have been updated if an error is returned. * * There are two possible failures: * - converting the string would require accessing memory at or * beyond the "end" pointer provided (-ERANGE) * - memory could not be allocated for the result (-ENOMEM) */ static inline char *ceph_extract_encoded_string(void **p, void *end, size_t *lenp, gfp_t gfp) { u32 len; void *sp = *p; char *buf; ceph_decode_32_safe(&sp, end, len, bad); if (!ceph_has_room(&sp, end, len)) goto bad; buf = kmalloc(len + 1, gfp); if (!buf) return ERR_PTR(-ENOMEM); if (len) memcpy(buf, sp, len); buf[len] = '\0'; *p = (char *) *p + sizeof (u32) + len; if (lenp) *lenp = (size_t) len; return buf; bad: return ERR_PTR(-ERANGE); } /* * skip helpers */ #define ceph_decode_skip_n(p, end, n, bad) \ do { \ ceph_decode_need(p, end, n, bad); \ *p += n; \ } while (0) #define ceph_decode_skip_64(p, end, bad) \ ceph_decode_skip_n(p, end, sizeof(u64), bad) #define ceph_decode_skip_32(p, end, bad) \ ceph_decode_skip_n(p, end, sizeof(u32), bad) #define ceph_decode_skip_16(p, end, bad) \ ceph_decode_skip_n(p, end, sizeof(u16), bad) #define ceph_decode_skip_8(p, end, bad) \ ceph_decode_skip_n(p, end, sizeof(u8), bad) #define ceph_decode_skip_string(p, end, bad) \ do { \ u32 len; \ \ ceph_decode_32_safe(p, end, len, bad); \ ceph_decode_skip_n(p, end, len, bad); \ } while (0) #define ceph_decode_skip_set(p, end, type, bad) \ do { \ u32 len; \ \ ceph_decode_32_safe(p, end, len, bad); \ while (len--) \ ceph_decode_skip_##type(p, end, bad); \ } while (0) #define ceph_decode_skip_map(p, end, ktype, vtype, bad) \ do { \ u32 len; \ \ ceph_decode_32_safe(p, end, len, bad); \ while (len--) { \ ceph_decode_skip_##ktype(p, end, bad); \ ceph_decode_skip_##vtype(p, end, bad); \ } \ } while (0) #define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \ do { \ u32 len; \ \ ceph_decode_32_safe(p, end, len, bad); \ while (len--) { \ ceph_decode_skip_##ktype1(p, end, bad); \ ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \ } \ } while (0) /* * struct ceph_timespec <-> struct timespec64 */ static inline void ceph_decode_timespec64(struct timespec64 *ts, const struct ceph_timespec *tv) { /* * This will still overflow in year 2106. We could extend * the protocol to steal two more bits from tv_nsec to * add three more 136 year epochs after that the way ext4 * does if necessary. */ ts->tv_sec = (time64_t)le32_to_cpu(tv->tv_sec); ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); } static inline void ceph_encode_timespec64(struct ceph_timespec *tv, const struct timespec64 *ts) { tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); } /* * sockaddr_storage <-> ceph_sockaddr */ #define CEPH_ENTITY_ADDR_TYPE_NONE 0 #define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1) #define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2) #define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3) static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a) { __be16 ss_family = htons(a->in_addr.ss_family); a->in_addr.ss_family = *(__u16 *)&ss_family; /* Banner addresses require TYPE_NONE */ a->type = CEPH_ENTITY_ADDR_TYPE_NONE; } static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a) { __be16 ss_family = *(__be16 *)&a->in_addr.ss_family; a->in_addr.ss_family = ntohs(ss_family); WARN_ON(a->in_addr.ss_family == 512); a->type = CEPH_ENTITY_ADDR_TYPE_LEGACY; } extern int ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr); int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, struct ceph_entity_addr *addr); int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr); void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr); /* * encoders */ static inline void ceph_encode_64(void **p, u64 v) { put_unaligned_le64(v, (__le64 *)*p); *p += sizeof(u64); } static inline void ceph_encode_32(void **p, u32 v) { put_unaligned_le32(v, (__le32 *)*p); *p += sizeof(u32); } static inline void ceph_encode_16(void **p, u16 v) { put_unaligned_le16(v, (__le16 *)*p); *p += sizeof(u16); } static inline void ceph_encode_8(void **p, u8 v) { *(u8 *)*p = v; (*p)++; } static inline void ceph_encode_copy(void **p, const void *s, int len) { memcpy(*p, s, len); *p += len; } /* * filepath, string encoders */ static inline void ceph_encode_filepath(void **p, void *end, u64 ino, const char *path) { u32 len = path ? strlen(path) : 0; BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end); ceph_encode_8(p, 1); ceph_encode_64(p, ino); ceph_encode_32(p, len); if (len) memcpy(*p, path, len); *p += len; } static inline void ceph_encode_string(void **p, void *end, const char *s, u32 len) { BUG_ON(*p + sizeof(len) + len > end); ceph_encode_32(p, len); if (len) memcpy(*p, s, len); *p += len; } /* * version and length starting block encoders/decoders */ /* current code version (u8) + compat code version (u8) + len of struct (u32) */ #define CEPH_ENCODING_START_BLK_LEN 6 /** * ceph_start_encoding - start encoding block * @struct_v: current (code) version of the encoding * @struct_compat: oldest code version that can decode it * @struct_len: length of struct encoding */ static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat, u32 struct_len) { ceph_encode_8(p, struct_v); ceph_encode_8(p, struct_compat); ceph_encode_32(p, struct_len); } /** * ceph_start_decoding - start decoding block * @v: current version of the encoding that the code supports * @name: name of the struct (free-form) * @struct_v: out param for the encoding version * @struct_len: out param for the length of struct encoding * * Validates the length of struct encoding, so unsafe ceph_decode_* * variants can be used for decoding. */ static inline int ceph_start_decoding(void **p, void *end, u8 v, const char *name, u8 *struct_v, u32 *struct_len) { u8 struct_compat; ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad); *struct_v = ceph_decode_8(p); struct_compat = ceph_decode_8(p); if (v < struct_compat) { pr_warn("got struct_v %d struct_compat %d > %d of %s\n", *struct_v, struct_compat, v, name); return -EINVAL; } *struct_len = ceph_decode_32(p); ceph_decode_need(p, end, *struct_len, bad); return 0; bad: return -ERANGE; } #define ceph_encode_need(p, end, n, bad) \ do { \ if (!likely(ceph_has_room(p, end, n))) \ goto bad; \ } while (0) #define ceph_encode_64_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u64), bad); \ ceph_encode_64(p, v); \ } while (0) #define ceph_encode_32_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u32), bad); \ ceph_encode_32(p, v); \ } while (0) #define ceph_encode_16_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u16), bad); \ ceph_encode_16(p, v); \ } while (0) #define ceph_encode_8_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u8), bad); \ ceph_encode_8(p, v); \ } while (0) #define ceph_encode_copy_safe(p, end, pv, n, bad) \ do { \ ceph_encode_need(p, end, n, bad); \ ceph_encode_copy(p, pv, n); \ } while (0) #define ceph_encode_string_safe(p, end, s, n, bad) \ do { \ ceph_encode_need(p, end, n, bad); \ ceph_encode_string(p, end, s, n); \ } while (0) #endif
2 2 2 2 2 2 2 2 2 5 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 19 3 3 3 3 3 3 19 19 19 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 19 19 19 18 2 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 // SPDX-License-Identifier: GPL-2.0-or-later /* */ #include <linux/init.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/usb/audio.h> #include <linux/usb/midi.h> #include <linux/bits.h> #include <sound/control.h> #include <sound/core.h> #include <sound/info.h> #include <sound/pcm.h> #include "usbaudio.h" #include "card.h" #include "mixer.h" #include "mixer_quirks.h" #include "midi.h" #include "midi2.h" #include "quirks.h" #include "helper.h" #include "endpoint.h" #include "pcm.h" #include "clock.h" #include "stream.h" /* * handle the quirks for the contained interfaces */ static int create_composite_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk_comp) { int probed_ifnum = get_iface_desc(iface->altsetting)->bInterfaceNumber; const struct snd_usb_audio_quirk *quirk; int err; for (quirk = quirk_comp->data; quirk->ifnum >= 0; ++quirk) { iface = usb_ifnum_to_if(chip->dev, quirk->ifnum); if (!iface) continue; if (quirk->ifnum != probed_ifnum && usb_interface_claimed(iface)) continue; err = snd_usb_create_quirk(chip, iface, driver, quirk); if (err < 0) return err; } for (quirk = quirk_comp->data; quirk->ifnum >= 0; ++quirk) { iface = usb_ifnum_to_if(chip->dev, quirk->ifnum); if (!iface) continue; if (quirk->ifnum != probed_ifnum && !usb_interface_claimed(iface)) { err = usb_driver_claim_interface(driver, iface, USB_AUDIO_IFACE_UNUSED); if (err < 0) return err; } } return 0; } static int ignore_interface_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { return 0; } static int create_any_midi_quirk(struct snd_usb_audio *chip, struct usb_interface *intf, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { return snd_usb_midi_v2_create(chip, intf, quirk, 0); } /* * create a stream for an interface with proper descriptors */ static int create_standard_audio_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { struct usb_host_interface *alts; struct usb_interface_descriptor *altsd; int err; alts = &iface->altsetting[0]; altsd = get_iface_desc(alts); err = snd_usb_parse_audio_interface(chip, altsd->bInterfaceNumber); if (err < 0) { usb_audio_err(chip, "cannot setup if %d: error %d\n", altsd->bInterfaceNumber, err); return err; } /* reset the current interface */ usb_set_interface(chip->dev, altsd->bInterfaceNumber, 0); return 0; } /* create the audio stream and the corresponding endpoints from the fixed * audioformat object; this is used for quirks with the fixed EPs */ static int add_audio_stream_from_fixed_fmt(struct snd_usb_audio *chip, struct audioformat *fp) { int stream, err; stream = (fp->endpoint & USB_DIR_IN) ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; snd_usb_audioformat_set_sync_ep(chip, fp); err = snd_usb_add_audio_stream(chip, stream, fp); if (err < 0) return err; err = snd_usb_add_endpoint(chip, fp->endpoint, SND_USB_ENDPOINT_TYPE_DATA); if (err < 0) return err; if (fp->sync_ep) { err = snd_usb_add_endpoint(chip, fp->sync_ep, fp->implicit_fb ? SND_USB_ENDPOINT_TYPE_DATA : SND_USB_ENDPOINT_TYPE_SYNC); if (err < 0) return err; } return 0; } /* * create a stream for an endpoint/altsetting without proper descriptors */ static int create_fixed_stream_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { struct audioformat *fp; struct usb_host_interface *alts; struct usb_interface_descriptor *altsd; unsigned *rate_table = NULL; int err; fp = kmemdup(quirk->data, sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; INIT_LIST_HEAD(&fp->list); if (fp->nr_rates > MAX_NR_RATES) { kfree(fp); return -EINVAL; } if (fp->nr_rates > 0) { rate_table = kmemdup_array(fp->rate_table, fp->nr_rates, sizeof(int), GFP_KERNEL); if (!rate_table) { kfree(fp); return -ENOMEM; } fp->rate_table = rate_table; } if (fp->iface != get_iface_desc(&iface->altsetting[0])->bInterfaceNumber || fp->altset_idx >= iface->num_altsetting) { err = -EINVAL; goto error; } alts = &iface->altsetting[fp->altset_idx]; altsd = get_iface_desc(alts); if (altsd->bNumEndpoints <= fp->ep_idx) { err = -EINVAL; goto error; } fp->protocol = altsd->bInterfaceProtocol; if (fp->datainterval == 0) fp->datainterval = snd_usb_parse_datainterval(chip, alts); if (fp->maxpacksize == 0) fp->maxpacksize = le16_to_cpu(get_endpoint(alts, fp->ep_idx)->wMaxPacketSize); if (!fp->fmt_type) fp->fmt_type = UAC_FORMAT_TYPE_I; err = add_audio_stream_from_fixed_fmt(chip, fp); if (err < 0) goto error; usb_set_interface(chip->dev, fp->iface, 0); snd_usb_init_pitch(chip, fp); snd_usb_init_sample_rate(chip, fp, fp->rate_max); return 0; error: list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); kfree(rate_table); return err; } static int create_auto_pcm_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver) { struct usb_host_interface *alts; struct usb_interface_descriptor *altsd; struct usb_endpoint_descriptor *epd; struct uac1_as_header_descriptor *ashd; struct uac_format_type_i_discrete_descriptor *fmtd; /* * Most Roland/Yamaha audio streaming interfaces have more or less * standard descriptors, but older devices might lack descriptors, and * future ones might change, so ensure that we fail silently if the * interface doesn't look exactly right. */ /* must have a non-zero altsetting for streaming */ if (iface->num_altsetting < 2) return -ENODEV; alts = &iface->altsetting[1]; altsd = get_iface_desc(alts); /* must have an isochronous endpoint for streaming */ if (altsd->bNumEndpoints < 1) return -ENODEV; epd = get_endpoint(alts, 0); if (!usb_endpoint_xfer_isoc(epd)) return -ENODEV; /* must have format descriptors */ ashd = snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, UAC_AS_GENERAL); fmtd = snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, UAC_FORMAT_TYPE); if (!ashd || ashd->bLength < 7 || !fmtd || fmtd->bLength < 8) return -ENODEV; return create_standard_audio_quirk(chip, iface, driver, NULL); } static int create_yamaha_midi_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, struct usb_host_interface *alts) { static const struct snd_usb_audio_quirk yamaha_midi_quirk = { .type = QUIRK_MIDI_YAMAHA }; struct usb_midi_in_jack_descriptor *injd; struct usb_midi_out_jack_descriptor *outjd; /* must have some valid jack descriptors */ injd = snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, USB_MS_MIDI_IN_JACK); outjd = snd_usb_find_csint_desc(alts->extra, alts->extralen, NULL, USB_MS_MIDI_OUT_JACK); if (!injd && !outjd) return -ENODEV; if ((injd && !snd_usb_validate_midi_desc(injd)) || (outjd && !snd_usb_validate_midi_desc(outjd))) return -ENODEV; if (injd && (injd->bLength < 5 || (injd->bJackType != USB_MS_EMBEDDED && injd->bJackType != USB_MS_EXTERNAL))) return -ENODEV; if (outjd && (outjd->bLength < 6 || (outjd->bJackType != USB_MS_EMBEDDED && outjd->bJackType != USB_MS_EXTERNAL))) return -ENODEV; return create_any_midi_quirk(chip, iface, driver, &yamaha_midi_quirk); } static int create_roland_midi_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, struct usb_host_interface *alts) { static const struct snd_usb_audio_quirk roland_midi_quirk = { .type = QUIRK_MIDI_ROLAND }; u8 *roland_desc = NULL; /* might have a vendor-specific descriptor <06 24 F1 02 ...> */ for (;;) { roland_desc = snd_usb_find_csint_desc(alts->extra, alts->extralen, roland_desc, 0xf1); if (!roland_desc) return -ENODEV; if (roland_desc[0] < 6 || roland_desc[3] != 2) continue; return create_any_midi_quirk(chip, iface, driver, &roland_midi_quirk); } } static int create_std_midi_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, struct usb_host_interface *alts) { struct usb_ms_header_descriptor *mshd; struct usb_ms_endpoint_descriptor *msepd; /* must have the MIDIStreaming interface header descriptor*/ mshd = (struct usb_ms_header_descriptor *)alts->extra; if (alts->extralen < 7 || mshd->bLength < 7 || mshd->bDescriptorType != USB_DT_CS_INTERFACE || mshd->bDescriptorSubtype != USB_MS_HEADER) return -ENODEV; /* must have the MIDIStreaming endpoint descriptor*/ msepd = (struct usb_ms_endpoint_descriptor *)alts->endpoint[0].extra; if (alts->endpoint[0].extralen < 4 || msepd->bLength < 4 || msepd->bDescriptorType != USB_DT_CS_ENDPOINT || msepd->bDescriptorSubtype != UAC_MS_GENERAL || msepd->bNumEmbMIDIJack < 1 || msepd->bNumEmbMIDIJack > 16) return -ENODEV; return create_any_midi_quirk(chip, iface, driver, NULL); } static int create_auto_midi_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver) { struct usb_host_interface *alts; struct usb_interface_descriptor *altsd; struct usb_endpoint_descriptor *epd; int err; alts = &iface->altsetting[0]; altsd = get_iface_desc(alts); /* must have at least one bulk/interrupt endpoint for streaming */ if (altsd->bNumEndpoints < 1) return -ENODEV; epd = get_endpoint(alts, 0); if (!usb_endpoint_xfer_bulk(epd) && !usb_endpoint_xfer_int(epd)) return -ENODEV; switch (USB_ID_VENDOR(chip->usb_id)) { case 0x0499: /* Yamaha */ err = create_yamaha_midi_quirk(chip, iface, driver, alts); if (err != -ENODEV) return err; break; case 0x0582: /* Roland */ err = create_roland_midi_quirk(chip, iface, driver, alts); if (err != -ENODEV) return err; break; } return create_std_midi_quirk(chip, iface, driver, alts); } static int create_autodetect_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { int err; err = create_auto_pcm_quirk(chip, iface, driver); if (err == -ENODEV) err = create_auto_midi_quirk(chip, iface, driver); return err; } /* * Create a stream for an Edirol UA-700/UA-25/UA-4FX interface. * The only way to detect the sample rate is by looking at wMaxPacketSize. */ static int create_uaxx_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { static const struct audioformat ua_format = { .formats = SNDRV_PCM_FMTBIT_S24_3LE, .channels = 2, .fmt_type = UAC_FORMAT_TYPE_I, .altsetting = 1, .altset_idx = 1, .rates = SNDRV_PCM_RATE_CONTINUOUS, }; struct usb_host_interface *alts; struct usb_interface_descriptor *altsd; struct audioformat *fp; int err; /* both PCM and MIDI interfaces have 2 or more altsettings */ if (iface->num_altsetting < 2) return -ENXIO; alts = &iface->altsetting[1]; altsd = get_iface_desc(alts); if (altsd->bNumEndpoints == 2) { static const struct snd_usb_midi_endpoint_info ua700_ep = { .out_cables = 0x0003, .in_cables = 0x0003 }; static const struct snd_usb_audio_quirk ua700_quirk = { .type = QUIRK_MIDI_FIXED_ENDPOINT, .data = &ua700_ep }; static const struct snd_usb_midi_endpoint_info uaxx_ep = { .out_cables = 0x0001, .in_cables = 0x0001 }; static const struct snd_usb_audio_quirk uaxx_quirk = { .type = QUIRK_MIDI_FIXED_ENDPOINT, .data = &uaxx_ep }; const struct snd_usb_audio_quirk *quirk = chip->usb_id == USB_ID(0x0582, 0x002b) ? &ua700_quirk : &uaxx_quirk; return __snd_usbmidi_create(chip->card, iface, &chip->midi_list, quirk, chip->usb_id, &chip->num_rawmidis); } if (altsd->bNumEndpoints != 1) return -ENXIO; fp = kmemdup(&ua_format, sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; fp->iface = altsd->bInterfaceNumber; fp->endpoint = get_endpoint(alts, 0)->bEndpointAddress; fp->ep_attr = get_endpoint(alts, 0)->bmAttributes; fp->datainterval = 0; fp->maxpacksize = le16_to_cpu(get_endpoint(alts, 0)->wMaxPacketSize); INIT_LIST_HEAD(&fp->list); switch (fp->maxpacksize) { case 0x120: fp->rate_max = fp->rate_min = 44100; break; case 0x138: case 0x140: fp->rate_max = fp->rate_min = 48000; break; case 0x258: case 0x260: fp->rate_max = fp->rate_min = 96000; break; default: usb_audio_err(chip, "unknown sample rate\n"); kfree(fp); return -ENXIO; } err = add_audio_stream_from_fixed_fmt(chip, fp); if (err < 0) { list_del(&fp->list); /* unlink for avoiding double-free */ kfree(fp); return err; } usb_set_interface(chip->dev, fp->iface, 0); return 0; } /* * Create a standard mixer for the specified interface. */ static int create_standard_mixer_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { if (quirk->ifnum < 0) return 0; return snd_usb_create_mixer(chip, quirk->ifnum); } /* * audio-interface quirks * * returns zero if no standard audio/MIDI parsing is needed. * returns a positive value if standard audio/midi interfaces are parsed * after this. * returns a negative value at error. */ int snd_usb_create_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk) { typedef int (*quirk_func_t)(struct snd_usb_audio *, struct usb_interface *, struct usb_driver *, const struct snd_usb_audio_quirk *); static const quirk_func_t quirk_funcs[] = { [QUIRK_IGNORE_INTERFACE] = ignore_interface_quirk, [QUIRK_COMPOSITE] = create_composite_quirk, [QUIRK_AUTODETECT] = create_autodetect_quirk, [QUIRK_MIDI_STANDARD_INTERFACE] = create_any_midi_quirk, [QUIRK_MIDI_FIXED_ENDPOINT] = create_any_midi_quirk, [QUIRK_MIDI_YAMAHA] = create_any_midi_quirk, [QUIRK_MIDI_ROLAND] = create_any_midi_quirk, [QUIRK_MIDI_MIDIMAN] = create_any_midi_quirk, [QUIRK_MIDI_NOVATION] = create_any_midi_quirk, [QUIRK_MIDI_RAW_BYTES] = create_any_midi_quirk, [QUIRK_MIDI_EMAGIC] = create_any_midi_quirk, [QUIRK_MIDI_CME] = create_any_midi_quirk, [QUIRK_MIDI_AKAI] = create_any_midi_quirk, [QUIRK_MIDI_FTDI] = create_any_midi_quirk, [QUIRK_MIDI_CH345] = create_any_midi_quirk, [QUIRK_AUDIO_STANDARD_INTERFACE] = create_standard_audio_quirk, [QUIRK_AUDIO_FIXED_ENDPOINT] = create_fixed_stream_quirk, [QUIRK_AUDIO_EDIROL_UAXX] = create_uaxx_quirk, [QUIRK_AUDIO_STANDARD_MIXER] = create_standard_mixer_quirk, }; if (quirk->type < QUIRK_TYPE_COUNT) { return quirk_funcs[quirk->type](chip, iface, driver, quirk); } else { usb_audio_err(chip, "invalid quirk type %d\n", quirk->type); return -ENXIO; } } /* * boot quirks */ #define EXTIGY_FIRMWARE_SIZE_OLD 794 #define EXTIGY_FIRMWARE_SIZE_NEW 483 static int snd_usb_extigy_boot_quirk(struct usb_device *dev, struct usb_interface *intf) { struct usb_host_config *config = dev->actconfig; struct usb_device_descriptor *new_device_descriptor __free(kfree) = NULL; int err; if (le16_to_cpu(get_cfg_desc(config)->wTotalLength) == EXTIGY_FIRMWARE_SIZE_OLD || le16_to_cpu(get_cfg_desc(config)->wTotalLength) == EXTIGY_FIRMWARE_SIZE_NEW) { dev_dbg(&dev->dev, "sending Extigy boot sequence...\n"); /* Send message to force it to reconnect with full interface. */ err = snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev,0), 0x10, 0x43, 0x0001, 0x000a, NULL, 0); if (err < 0) dev_dbg(&dev->dev, "error sending boot message: %d\n", err); new_device_descriptor = kmalloc(sizeof(*new_device_descriptor), GFP_KERNEL); if (!new_device_descriptor) return -ENOMEM; err = usb_get_descriptor(dev, USB_DT_DEVICE, 0, new_device_descriptor, sizeof(*new_device_descriptor)); if (err < 0) dev_dbg(&dev->dev, "error usb_get_descriptor: %d\n", err); if (new_device_descriptor->bNumConfigurations > dev->descriptor.bNumConfigurations) dev_dbg(&dev->dev, "error too large bNumConfigurations: %d\n", new_device_descriptor->bNumConfigurations); else memcpy(&dev->descriptor, new_device_descriptor, sizeof(dev->descriptor)); err = usb_reset_configuration(dev); if (err < 0) dev_dbg(&dev->dev, "error usb_reset_configuration: %d\n", err); dev_dbg(&dev->dev, "extigy_boot: new boot length = %d\n", le16_to_cpu(get_cfg_desc(config)->wTotalLength)); return -ENODEV; /* quit this anyway */ } return 0; } static int snd_usb_audigy2nx_boot_quirk(struct usb_device *dev) { u8 buf = 1; snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), 0x2a, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_OTHER, 0, 0, &buf, 1); if (buf == 0) { snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 0x29, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_OTHER, 1, 2000, NULL, 0); return -ENODEV; } return 0; } static int snd_usb_fasttrackpro_boot_quirk(struct usb_device *dev) { int err; if (dev->actconfig->desc.bConfigurationValue == 1) { dev_info(&dev->dev, "Fast Track Pro switching to config #2\n"); /* This function has to be available by the usb core module. * if it is not avialable the boot quirk has to be left out * and the configuration has to be set by udev or hotplug * rules */ err = usb_driver_set_configuration(dev, 2); if (err < 0) dev_dbg(&dev->dev, "error usb_driver_set_configuration: %d\n", err); /* Always return an error, so that we stop creating a device that will just be destroyed and recreated with a new configuration */ return -ENODEV; } else dev_info(&dev->dev, "Fast Track Pro config OK\n"); return 0; } /* * C-Media CM106/CM106+ have four 16-bit internal registers that are nicely * documented in the device's data sheet. */ static int snd_usb_cm106_write_int_reg(struct usb_device *dev, int reg, u16 value) { u8 buf[4]; buf[0] = 0x20; buf[1] = value & 0xff; buf[2] = (value >> 8) & 0xff; buf[3] = reg; return snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), USB_REQ_SET_CONFIGURATION, USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_ENDPOINT, 0, 0, &buf, 4); } static int snd_usb_cm106_boot_quirk(struct usb_device *dev) { /* * Enable line-out driver mode, set headphone source to front * channels, enable stereo mic. */ return snd_usb_cm106_write_int_reg(dev, 2, 0x8004); } /* * CM6206 registers from the CM6206 datasheet rev 2.1 */ #define CM6206_REG0_DMA_MASTER BIT(15) #define CM6206_REG0_SPDIFO_RATE_48K (2 << 12) #define CM6206_REG0_SPDIFO_RATE_96K (7 << 12) /* Bit 4 thru 11 is the S/PDIF category code */ #define CM6206_REG0_SPDIFO_CAT_CODE_GENERAL (0 << 4) #define CM6206_REG0_SPDIFO_EMPHASIS_CD BIT(3) #define CM6206_REG0_SPDIFO_COPYRIGHT_NA BIT(2) #define CM6206_REG0_SPDIFO_NON_AUDIO BIT(1) #define CM6206_REG0_SPDIFO_PRO_FORMAT BIT(0) #define CM6206_REG1_TEST_SEL_CLK BIT(14) #define CM6206_REG1_PLLBIN_EN BIT(13) #define CM6206_REG1_SOFT_MUTE_EN BIT(12) #define CM6206_REG1_GPIO4_OUT BIT(11) #define CM6206_REG1_GPIO4_OE BIT(10) #define CM6206_REG1_GPIO3_OUT BIT(9) #define CM6206_REG1_GPIO3_OE BIT(8) #define CM6206_REG1_GPIO2_OUT BIT(7) #define CM6206_REG1_GPIO2_OE BIT(6) #define CM6206_REG1_GPIO1_OUT BIT(5) #define CM6206_REG1_GPIO1_OE BIT(4) #define CM6206_REG1_SPDIFO_INVALID BIT(3) #define CM6206_REG1_SPDIF_LOOP_EN BIT(2) #define CM6206_REG1_SPDIFO_DIS BIT(1) #define CM6206_REG1_SPDIFI_MIX BIT(0) #define CM6206_REG2_DRIVER_ON BIT(15) #define CM6206_REG2_HEADP_SEL_SIDE_CHANNELS (0 << 13) #define CM6206_REG2_HEADP_SEL_SURROUND_CHANNELS (1 << 13) #define CM6206_REG2_HEADP_SEL_CENTER_SUBW (2 << 13) #define CM6206_REG2_HEADP_SEL_FRONT_CHANNELS (3 << 13) #define CM6206_REG2_MUTE_HEADPHONE_RIGHT BIT(12) #define CM6206_REG2_MUTE_HEADPHONE_LEFT BIT(11) #define CM6206_REG2_MUTE_REAR_SURROUND_RIGHT BIT(10) #define CM6206_REG2_MUTE_REAR_SURROUND_LEFT BIT(9) #define CM6206_REG2_MUTE_SIDE_SURROUND_RIGHT BIT(8) #define CM6206_REG2_MUTE_SIDE_SURROUND_LEFT BIT(7) #define CM6206_REG2_MUTE_SUBWOOFER BIT(6) #define CM6206_REG2_MUTE_CENTER BIT(5) #define CM6206_REG2_MUTE_RIGHT_FRONT BIT(3) #define CM6206_REG2_MUTE_LEFT_FRONT BIT(3) #define CM6206_REG2_EN_BTL BIT(2) #define CM6206_REG2_MCUCLKSEL_1_5_MHZ (0) #define CM6206_REG2_MCUCLKSEL_3_MHZ (1) #define CM6206_REG2_MCUCLKSEL_6_MHZ (2) #define CM6206_REG2_MCUCLKSEL_12_MHZ (3) /* Bit 11..13 sets the sensitivity to FLY tuner volume control VP/VD signal */ #define CM6206_REG3_FLYSPEED_DEFAULT (2 << 11) #define CM6206_REG3_VRAP25EN BIT(10) #define CM6206_REG3_MSEL1 BIT(9) #define CM6206_REG3_SPDIFI_RATE_44_1K BIT(0 << 7) #define CM6206_REG3_SPDIFI_RATE_48K BIT(2 << 7) #define CM6206_REG3_SPDIFI_RATE_32K BIT(3 << 7) #define CM6206_REG3_PINSEL BIT(6) #define CM6206_REG3_FOE BIT(5) #define CM6206_REG3_ROE BIT(4) #define CM6206_REG3_CBOE BIT(3) #define CM6206_REG3_LOSE BIT(2) #define CM6206_REG3_HPOE BIT(1) #define CM6206_REG3_SPDIFI_CANREC BIT(0) #define CM6206_REG5_DA_RSTN BIT(13) #define CM6206_REG5_AD_RSTN BIT(12) #define CM6206_REG5_SPDIFO_AD2SPDO BIT(12) #define CM6206_REG5_SPDIFO_SEL_FRONT (0 << 9) #define CM6206_REG5_SPDIFO_SEL_SIDE_SUR (1 << 9) #define CM6206_REG5_SPDIFO_SEL_CEN_LFE (2 << 9) #define CM6206_REG5_SPDIFO_SEL_REAR_SUR (3 << 9) #define CM6206_REG5_CODECM BIT(8) #define CM6206_REG5_EN_HPF BIT(7) #define CM6206_REG5_T_SEL_DSDA4 BIT(6) #define CM6206_REG5_T_SEL_DSDA3 BIT(5) #define CM6206_REG5_T_SEL_DSDA2 BIT(4) #define CM6206_REG5_T_SEL_DSDA1 BIT(3) #define CM6206_REG5_T_SEL_DSDAD_NORMAL 0 #define CM6206_REG5_T_SEL_DSDAD_FRONT 4 #define CM6206_REG5_T_SEL_DSDAD_S_SURROUND 5 #define CM6206_REG5_T_SEL_DSDAD_CEN_LFE 6 #define CM6206_REG5_T_SEL_DSDAD_R_SURROUND 7 static int snd_usb_cm6206_boot_quirk(struct usb_device *dev) { int err = 0, reg; int val[] = { /* * Values here are chosen based on sniffing USB traffic * under Windows. * * REG0: DAC is master, sample rate 48kHz, no copyright */ CM6206_REG0_SPDIFO_RATE_48K | CM6206_REG0_SPDIFO_COPYRIGHT_NA, /* * REG1: PLL binary search enable, soft mute enable. */ CM6206_REG1_PLLBIN_EN | CM6206_REG1_SOFT_MUTE_EN, /* * REG2: enable output drivers, * select front channels to the headphone output, * then mute the headphone channels, run the MCU * at 1.5 MHz. */ CM6206_REG2_DRIVER_ON | CM6206_REG2_HEADP_SEL_FRONT_CHANNELS | CM6206_REG2_MUTE_HEADPHONE_RIGHT | CM6206_REG2_MUTE_HEADPHONE_LEFT, /* * REG3: default flyspeed, set 2.5V mic bias * enable all line out ports and enable SPDIF */ CM6206_REG3_FLYSPEED_DEFAULT | CM6206_REG3_VRAP25EN | CM6206_REG3_FOE | CM6206_REG3_ROE | CM6206_REG3_CBOE | CM6206_REG3_LOSE | CM6206_REG3_HPOE | CM6206_REG3_SPDIFI_CANREC, /* REG4 is just a bunch of GPIO lines */ 0x0000, /* REG5: de-assert AD/DA reset signals */ CM6206_REG5_DA_RSTN | CM6206_REG5_AD_RSTN }; for (reg = 0; reg < ARRAY_SIZE(val); reg++) { err = snd_usb_cm106_write_int_reg(dev, reg, val[reg]); if (err < 0) return err; } return err; } /* quirk for Plantronics GameCom 780 with CM6302 chip */ static int snd_usb_gamecon780_boot_quirk(struct usb_device *dev) { /* set the initial volume and don't change; other values are either * too loud or silent due to firmware bug (bko#65251) */ u8 buf[2] = { 0x74, 0xe3 }; return snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), UAC_SET_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT, UAC_FU_VOLUME << 8, 9 << 8, buf, 2); } /* * Novation Twitch DJ controller * Focusrite Novation Saffire 6 USB audio card */ static int snd_usb_novation_boot_quirk(struct usb_device *dev) { /* preemptively set up the device because otherwise the * raw MIDI endpoints are not active */ usb_set_interface(dev, 0, 1); return 0; } /* * This call will put the synth in "USB send" mode, i.e it will send MIDI * messages through USB (this is disabled at startup). The synth will * acknowledge by sending a sysex on endpoint 0x85 and by displaying a USB * sign on its LCD. Values here are chosen based on sniffing USB traffic * under Windows. */ static int snd_usb_accessmusic_boot_quirk(struct usb_device *dev) { int err, actual_length; /* "midi send" enable */ static const u8 seq[] = { 0x4e, 0x73, 0x52, 0x01 }; void *buf; if (usb_pipe_type_check(dev, usb_sndintpipe(dev, 0x05))) return -EINVAL; buf = kmemdup(seq, ARRAY_SIZE(seq), GFP_KERNEL); if (!buf) return -ENOMEM; err = usb_interrupt_msg(dev, usb_sndintpipe(dev, 0x05), buf, ARRAY_SIZE(seq), &actual_length, 1000); kfree(buf); if (err < 0) return err; return 0; } /* * Some sound cards from Native Instruments are in fact compliant to the USB * audio standard of version 2 and other approved USB standards, even though * they come up as vendor-specific device when first connected. * * However, they can be told to come up with a new set of descriptors * upon their next enumeration, and the interfaces announced by the new * descriptors will then be handled by the kernel's class drivers. As the * product ID will also change, no further checks are required. */ static int snd_usb_nativeinstruments_boot_quirk(struct usb_device *dev) { int ret; ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0xaf, USB_TYPE_VENDOR | USB_RECIP_DEVICE, 1, 0, NULL, 0, 1000); if (ret < 0) return ret; usb_reset_device(dev); /* return -EAGAIN, so the creation of an audio interface for this * temporary device is aborted. The device will reconnect with a * new product ID */ return -EAGAIN; } static void mbox2_setup_48_24_magic(struct usb_device *dev) { u8 srate[3]; u8 temp[12]; /* Choose 48000Hz permanently */ srate[0] = 0x80; srate[1] = 0xbb; srate[2] = 0x00; /* Send the magic! */ snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), 0x01, 0x22, 0x0100, 0x0085, &temp, 0x0003); snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 0x81, 0xa2, 0x0100, 0x0085, &srate, 0x0003); snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 0x81, 0xa2, 0x0100, 0x0086, &srate, 0x0003); snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 0x81, 0xa2, 0x0100, 0x0003, &srate, 0x0003); return; } /* Digidesign Mbox 2 needs to load firmware onboard * and driver must wait a few seconds for initialisation. */ #define MBOX2_FIRMWARE_SIZE 646 #define MBOX2_BOOT_LOADING 0x01 /* Hard coded into the device */ #define MBOX2_BOOT_READY 0x02 /* Hard coded into the device */ static int snd_usb_mbox2_boot_quirk(struct usb_device *dev) { struct usb_host_config *config = dev->actconfig; struct usb_device_descriptor *new_device_descriptor __free(kfree) = NULL; int err; u8 bootresponse[0x12]; int fwsize; int count; fwsize = le16_to_cpu(get_cfg_desc(config)->wTotalLength); if (fwsize != MBOX2_FIRMWARE_SIZE) { dev_err(&dev->dev, "Invalid firmware size=%d.\n", fwsize); return -ENODEV; } dev_dbg(&dev->dev, "Sending Digidesign Mbox 2 boot sequence...\n"); count = 0; bootresponse[0] = MBOX2_BOOT_LOADING; while ((bootresponse[0] == MBOX2_BOOT_LOADING) && (count < 10)) { msleep(500); /* 0.5 second delay */ snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), /* Control magic - load onboard firmware */ 0x85, 0xc0, 0x0001, 0x0000, &bootresponse, 0x0012); if (bootresponse[0] == MBOX2_BOOT_READY) break; dev_dbg(&dev->dev, "device not ready, resending boot sequence...\n"); count++; } if (bootresponse[0] != MBOX2_BOOT_READY) { dev_err(&dev->dev, "Unknown bootresponse=%d, or timed out, ignoring device.\n", bootresponse[0]); return -ENODEV; } dev_dbg(&dev->dev, "device initialised!\n"); new_device_descriptor = kmalloc(sizeof(*new_device_descriptor), GFP_KERNEL); if (!new_device_descriptor) return -ENOMEM; err = usb_get_descriptor(dev, USB_DT_DEVICE, 0, new_device_descriptor, sizeof(*new_device_descriptor)); if (err < 0) dev_dbg(&dev->dev, "error usb_get_descriptor: %d\n", err); if (new_device_descriptor->bNumConfigurations > dev->descriptor.bNumConfigurations) dev_dbg(&dev->dev, "error too large bNumConfigurations: %d\n", new_device_descriptor->bNumConfigurations); else memcpy(&dev->descriptor, new_device_descriptor, sizeof(dev->descriptor)); err = usb_reset_configuration(dev); if (err < 0) dev_dbg(&dev->dev, "error usb_reset_configuration: %d\n", err); dev_dbg(&dev->dev, "mbox2_boot: new boot length = %d\n", le16_to_cpu(get_cfg_desc(config)->wTotalLength)); mbox2_setup_48_24_magic(dev); dev_info(&dev->dev, "Digidesign Mbox 2: 24bit 48kHz"); return 0; /* Successful boot */ } static int snd_usb_axefx3_boot_quirk(struct usb_device *dev) { int err; dev_dbg(&dev->dev, "Waiting for Axe-Fx III to boot up...\n"); /* If the Axe-Fx III has not fully booted, it will timeout when trying * to enable the audio streaming interface. A more generous timeout is * used here to detect when the Axe-Fx III has finished booting as the * set interface message will be acked once it has */ err = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), USB_REQ_SET_INTERFACE, USB_RECIP_INTERFACE, 1, 1, NULL, 0, 120000); if (err < 0) { dev_err(&dev->dev, "failed waiting for Axe-Fx III to boot: %d\n", err); return err; } dev_dbg(&dev->dev, "Axe-Fx III is now ready\n"); err = usb_set_interface(dev, 1, 0); if (err < 0) dev_dbg(&dev->dev, "error stopping Axe-Fx III interface: %d\n", err); return 0; } static void mbox3_setup_defaults(struct usb_device *dev) { /* The Mbox 3 is "little endian" */ /* max volume is: 0x0000. */ /* min volume is: 0x0080 (shown in little endian form) */ u8 com_buff[2]; /* Deactivate Tuner */ /* on = 0x01*/ /* off = 0x00*/ com_buff[0] = 0x00; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 0x01, 0x21, 0x0003, 0x2001, &com_buff, 1); /* Set clock source to Internal (as opposed to S/PDIF) */ /* Internal = 0x01*/ /* S/PDIF = 0x02*/ com_buff[0] = 0x01; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0100, 0x8001, &com_buff, 1); /* Mute the hardware loopbacks to start the device in a known state. */ com_buff[0] = 0x00; com_buff[1] = 0x80; /* Analogue input 1 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0110, 0x4001, &com_buff, 2); /* Analogue input 1 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0111, 0x4001, &com_buff, 2); /* Analogue input 2 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0114, 0x4001, &com_buff, 2); /* Analogue input 2 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0115, 0x4001, &com_buff, 2); /* Analogue input 3 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0118, 0x4001, &com_buff, 2); /* Analogue input 3 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0119, 0x4001, &com_buff, 2); /* Analogue input 4 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x011c, 0x4001, &com_buff, 2); /* Analogue input 4 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x011d, 0x4001, &com_buff, 2); /* Set software sends to output */ com_buff[0] = 0x00; com_buff[1] = 0x00; /* Analogue software return 1 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0100, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x80; /* Analogue software return 1 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0101, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x80; /* Analogue software return 2 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0104, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x00; /* Analogue software return 2 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0105, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x80; /* Analogue software return 3 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0108, 0x4001, &com_buff, 2); /* Analogue software return 3 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0109, 0x4001, &com_buff, 2); /* Analogue software return 4 left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x010c, 0x4001, &com_buff, 2); /* Analogue software return 4 right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x010d, 0x4001, &com_buff, 2); /* Return to muting sends */ com_buff[0] = 0x00; com_buff[1] = 0x80; /* Analogue fx return left channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0120, 0x4001, &com_buff, 2); /* Analogue fx return right channel: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0121, 0x4001, &com_buff, 2); /* Analogue software input 1 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0100, 0x4201, &com_buff, 2); /* Analogue software input 2 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0101, 0x4201, &com_buff, 2); /* Analogue software input 3 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0102, 0x4201, &com_buff, 2); /* Analogue software input 4 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0103, 0x4201, &com_buff, 2); /* Analogue input 1 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0104, 0x4201, &com_buff, 2); /* Analogue input 2 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0105, 0x4201, &com_buff, 2); /* Analogue input 3 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0106, 0x4201, &com_buff, 2); /* Analogue input 4 fx send: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0107, 0x4201, &com_buff, 2); /* Toggle allowing host control */ /* Not needed com_buff[0] = 0x02; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 3, 0x21, 0x0000, 0x2001, &com_buff, 1); */ /* Do not dim fx returns */ com_buff[0] = 0x00; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 3, 0x21, 0x0002, 0x2001, &com_buff, 1); /* Do not set fx returns to mono */ com_buff[0] = 0x00; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 3, 0x21, 0x0001, 0x2001, &com_buff, 1); /* Mute the S/PDIF hardware loopback * same odd volume logic here as above */ com_buff[0] = 0x00; com_buff[1] = 0x80; /* S/PDIF hardware input 1 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0112, 0x4001, &com_buff, 2); /* S/PDIF hardware input 1 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0113, 0x4001, &com_buff, 2); /* S/PDIF hardware input 2 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0116, 0x4001, &com_buff, 2); /* S/PDIF hardware input 2 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0117, 0x4001, &com_buff, 2); /* S/PDIF hardware input 3 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x011a, 0x4001, &com_buff, 2); /* S/PDIF hardware input 3 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x011b, 0x4001, &com_buff, 2); /* S/PDIF hardware input 4 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x011e, 0x4001, &com_buff, 2); /* S/PDIF hardware input 4 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x011f, 0x4001, &com_buff, 2); /* S/PDIF software return 1 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0102, 0x4001, &com_buff, 2); /* S/PDIF software return 1 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0103, 0x4001, &com_buff, 2); /* S/PDIF software return 2 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0106, 0x4001, &com_buff, 2); /* S/PDIF software return 2 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0107, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x00; /* S/PDIF software return 3 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x010a, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x80; /* S/PDIF software return 3 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x010b, 0x4001, &com_buff, 2); /* S/PDIF software return 4 left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x010e, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x00; /* S/PDIF software return 4 right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x010f, 0x4001, &com_buff, 2); com_buff[0] = 0x00; com_buff[1] = 0x80; /* S/PDIF fx returns left channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0122, 0x4001, &com_buff, 2); /* S/PDIF fx returns right channel */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0123, 0x4001, &com_buff, 2); /* Set the dropdown "Effect" to the first option */ /* Room1 = 0x00 */ /* Room2 = 0x01 */ /* Room3 = 0x02 */ /* Hall 1 = 0x03 */ /* Hall 2 = 0x04 */ /* Plate = 0x05 */ /* Delay = 0x06 */ /* Echo = 0x07 */ com_buff[0] = 0x00; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0200, 0x4301, &com_buff, 1); /* max is 0xff */ /* min is 0x00 */ /* Set the effect duration to 0 */ /* max is 0xffff */ /* min is 0x0000 */ com_buff[0] = 0x00; com_buff[1] = 0x00; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0400, 0x4301, &com_buff, 2); /* Set the effect volume and feedback to 0 */ /* max is 0xff */ /* min is 0x00 */ com_buff[0] = 0x00; /* feedback: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0500, 0x4301, &com_buff, 1); /* volume: */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 1, 0x21, 0x0300, 0x4301, &com_buff, 1); /* Set soft button hold duration */ /* 0x03 = 250ms */ /* 0x05 = 500ms DEFAULT */ /* 0x08 = 750ms */ /* 0x0a = 1sec */ com_buff[0] = 0x05; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 3, 0x21, 0x0005, 0x2001, &com_buff, 1); /* Use dim LEDs for button of state */ com_buff[0] = 0x00; snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 3, 0x21, 0x0004, 0x2001, &com_buff, 1); } #define MBOX3_DESCRIPTOR_SIZE 464 static int snd_usb_mbox3_boot_quirk(struct usb_device *dev) { struct usb_host_config *config = dev->actconfig; struct usb_device_descriptor *new_device_descriptor __free(kfree) = NULL; int err; int descriptor_size; descriptor_size = le16_to_cpu(get_cfg_desc(config)->wTotalLength); if (descriptor_size != MBOX3_DESCRIPTOR_SIZE) { dev_err(&dev->dev, "MBOX3: Invalid descriptor size=%d.\n", descriptor_size); return -ENODEV; } dev_dbg(&dev->dev, "MBOX3: device initialised!\n"); new_device_descriptor = kmalloc(sizeof(*new_device_descriptor), GFP_KERNEL); if (!new_device_descriptor) return -ENOMEM; err = usb_get_descriptor(dev, USB_DT_DEVICE, 0, new_device_descriptor, sizeof(*new_device_descriptor)); if (err < 0) dev_dbg(&dev->dev, "MBOX3: error usb_get_descriptor: %d\n", err); if (new_device_descriptor->bNumConfigurations > dev->descriptor.bNumConfigurations) dev_dbg(&dev->dev, "MBOX3: error too large bNumConfigurations: %d\n", new_device_descriptor->bNumConfigurations); else memcpy(&dev->descriptor, new_device_descriptor, sizeof(dev->descriptor)); err = usb_reset_configuration(dev); if (err < 0) dev_dbg(&dev->dev, "MBOX3: error usb_reset_configuration: %d\n", err); dev_dbg(&dev->dev, "MBOX3: new boot length = %d\n", le16_to_cpu(get_cfg_desc(config)->wTotalLength)); mbox3_setup_defaults(dev); dev_info(&dev->dev, "MBOX3: Initialized."); return 0; /* Successful boot */ } #define MICROBOOK_BUF_SIZE 128 static int snd_usb_motu_microbookii_communicate(struct usb_device *dev, u8 *buf, int buf_size, int *length) { int err, actual_length; if (usb_pipe_type_check(dev, usb_sndintpipe(dev, 0x01))) return -EINVAL; err = usb_interrupt_msg(dev, usb_sndintpipe(dev, 0x01), buf, *length, &actual_length, 1000); if (err < 0) return err; print_hex_dump(KERN_DEBUG, "MicroBookII snd: ", DUMP_PREFIX_NONE, 16, 1, buf, actual_length, false); memset(buf, 0, buf_size); if (usb_pipe_type_check(dev, usb_rcvintpipe(dev, 0x82))) return -EINVAL; err = usb_interrupt_msg(dev, usb_rcvintpipe(dev, 0x82), buf, buf_size, &actual_length, 1000); if (err < 0) return err; print_hex_dump(KERN_DEBUG, "MicroBookII rcv: ", DUMP_PREFIX_NONE, 16, 1, buf, actual_length, false); *length = actual_length; return 0; } static int snd_usb_motu_microbookii_boot_quirk(struct usb_device *dev) { int err, actual_length, poll_attempts = 0; static const u8 set_samplerate_seq[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0b, 0x14, 0x00, 0x00, 0x00, 0x01 }; static const u8 poll_ready_seq[] = { 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0b, 0x18 }; u8 *buf = kzalloc(MICROBOOK_BUF_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; dev_info(&dev->dev, "Waiting for MOTU Microbook II to boot up...\n"); /* First we tell the device which sample rate to use. */ memcpy(buf, set_samplerate_seq, sizeof(set_samplerate_seq)); actual_length = sizeof(set_samplerate_seq); err = snd_usb_motu_microbookii_communicate(dev, buf, MICROBOOK_BUF_SIZE, &actual_length); if (err < 0) { dev_err(&dev->dev, "failed setting the sample rate for Motu MicroBook II: %d\n", err); goto free_buf; } /* Then we poll every 100 ms until the device informs of its readiness. */ while (true) { if (++poll_attempts > 100) { dev_err(&dev->dev, "failed booting Motu MicroBook II: timeout\n"); err = -ENODEV; goto free_buf; } memset(buf, 0, MICROBOOK_BUF_SIZE); memcpy(buf, poll_ready_seq, sizeof(poll_ready_seq)); actual_length = sizeof(poll_ready_seq); err = snd_usb_motu_microbookii_communicate( dev, buf, MICROBOOK_BUF_SIZE, &actual_length); if (err < 0) { dev_err(&dev->dev, "failed booting Motu MicroBook II: communication error %d\n", err); goto free_buf; } /* the device signals its readiness through a message of the * form * XX 06 00 00 00 00 0b 18 00 00 00 01 * If the device is not yet ready to accept audio data, the * last byte of that sequence is 00. */ if (actual_length == 12 && buf[actual_length - 1] == 1) break; msleep(100); } dev_info(&dev->dev, "MOTU MicroBook II ready\n"); free_buf: kfree(buf); return err; } static int snd_usb_motu_m_series_boot_quirk(struct usb_device *dev) { msleep(4000); return 0; } static int snd_usb_rme_digiface_boot_quirk(struct usb_device *dev) { /* Disable mixer, internal clock, all outputs ADAT, 48kHz, TMS off */ snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 16, 0x40, 0x2410, 0x7fff, NULL, 0); snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 18, 0x40, 0x0104, 0xffff, NULL, 0); /* Disable loopback for all inputs */ for (int ch = 0; ch < 32; ch++) snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 22, 0x40, 0x400, ch, NULL, 0); /* Unity gain for all outputs */ for (int ch = 0; ch < 34; ch++) snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 21, 0x40, 0x9000, 0x100 + ch, NULL, 0); return 0; } /* * Setup quirks */ #define MAUDIO_SET 0x01 /* parse device_setup */ #define MAUDIO_SET_COMPATIBLE 0x80 /* use only "win-compatible" interfaces */ #define MAUDIO_SET_DTS 0x02 /* enable DTS Digital Output */ #define MAUDIO_SET_96K 0x04 /* 48-96kHz rate if set, 8-48kHz otherwise */ #define MAUDIO_SET_24B 0x08 /* 24bits sample if set, 16bits otherwise */ #define MAUDIO_SET_DI 0x10 /* enable Digital Input */ #define MAUDIO_SET_MASK 0x1f /* bit mask for setup value */ #define MAUDIO_SET_24B_48K_DI 0x19 /* 24bits+48kHz+Digital Input */ #define MAUDIO_SET_24B_48K_NOTDI 0x09 /* 24bits+48kHz+No Digital Input */ #define MAUDIO_SET_16B_48K_DI 0x11 /* 16bits+48kHz+Digital Input */ #define MAUDIO_SET_16B_48K_NOTDI 0x01 /* 16bits+48kHz+No Digital Input */ static int quattro_skip_setting_quirk(struct snd_usb_audio *chip, int iface, int altno) { /* Reset ALL ifaces to 0 altsetting. * Call it for every possible altsetting of every interface. */ usb_set_interface(chip->dev, iface, 0); if (chip->setup & MAUDIO_SET) { if (chip->setup & MAUDIO_SET_COMPATIBLE) { if (iface != 1 && iface != 2) return 1; /* skip all interfaces but 1 and 2 */ } else { unsigned int mask; if (iface == 1 || iface == 2) return 1; /* skip interfaces 1 and 2 */ if ((chip->setup & MAUDIO_SET_96K) && altno != 1) return 1; /* skip this altsetting */ mask = chip->setup & MAUDIO_SET_MASK; if (mask == MAUDIO_SET_24B_48K_DI && altno != 2) return 1; /* skip this altsetting */ if (mask == MAUDIO_SET_24B_48K_NOTDI && altno != 3) return 1; /* skip this altsetting */ if (mask == MAUDIO_SET_16B_48K_NOTDI && altno != 4) return 1; /* skip this altsetting */ } } usb_audio_dbg(chip, "using altsetting %d for interface %d config %d\n", altno, iface, chip->setup); return 0; /* keep this altsetting */ } static int audiophile_skip_setting_quirk(struct snd_usb_audio *chip, int iface, int altno) { /* Reset ALL ifaces to 0 altsetting. * Call it for every possible altsetting of every interface. */ usb_set_interface(chip->dev, iface, 0); if (chip->setup & MAUDIO_SET) { unsigned int mask; if ((chip->setup & MAUDIO_SET_DTS) && altno != 6) return 1; /* skip this altsetting */ if ((chip->setup & MAUDIO_SET_96K) && altno != 1) return 1; /* skip this altsetting */ mask = chip->setup & MAUDIO_SET_MASK; if (mask == MAUDIO_SET_24B_48K_DI && altno != 2) return 1; /* skip this altsetting */ if (mask == MAUDIO_SET_24B_48K_NOTDI && altno != 3) return 1; /* skip this altsetting */ if (mask == MAUDIO_SET_16B_48K_DI && altno != 4) return 1; /* skip this altsetting */ if (mask == MAUDIO_SET_16B_48K_NOTDI && altno != 5) return 1; /* skip this altsetting */ } return 0; /* keep this altsetting */ } static int fasttrackpro_skip_setting_quirk(struct snd_usb_audio *chip, int iface, int altno) { /* Reset ALL ifaces to 0 altsetting. * Call it for every possible altsetting of every interface. */ usb_set_interface(chip->dev, iface, 0); /* possible configuration where both inputs and only one output is *used is not supported by the current setup */ if (chip->setup & (MAUDIO_SET | MAUDIO_SET_24B)) { if (chip->setup & MAUDIO_SET_96K) { if (altno != 3 && altno != 6) return 1; } else if (chip->setup & MAUDIO_SET_DI) { if (iface == 4) return 1; /* no analog input */ if (altno != 2 && altno != 5) return 1; /* enable only altsets 2 and 5 */ } else { if (iface == 5) return 1; /* disable digialt input */ if (altno != 2 && altno != 5) return 1; /* enalbe only altsets 2 and 5 */ } } else { /* keep only 16-Bit mode */ if (altno != 1) return 1; } usb_audio_dbg(chip, "using altsetting %d for interface %d config %d\n", altno, iface, chip->setup); return 0; /* keep this altsetting */ } static int s1810c_skip_setting_quirk(struct snd_usb_audio *chip, int iface, int altno) { /* * Altno settings: * * Playback (Interface 1): * 1: 6 Analog + 2 S/PDIF * 2: 6 Analog + 2 S/PDIF * 3: 6 Analog * * Capture (Interface 2): * 1: 8 Analog + 2 S/PDIF + 8 ADAT * 2: 8 Analog + 2 S/PDIF + 4 ADAT * 3: 8 Analog */ /* * I'll leave 2 as the default one and * use device_setup to switch to the * other two. */ if ((chip->setup == 0 || chip->setup > 2) && altno != 2) return 1; else if (chip->setup == 1 && altno != 1) return 1; else if (chip->setup == 2 && altno != 3) return 1; return 0; } int snd_usb_apply_interface_quirk(struct snd_usb_audio *chip, int iface, int altno) { /* audiophile usb: skip altsets incompatible with device_setup */ if (chip->usb_id == USB_ID(0x0763, 0x2003)) return audiophile_skip_setting_quirk(chip, iface, altno); /* quattro usb: skip altsets incompatible with device_setup */ if (chip->usb_id == USB_ID(0x0763, 0x2001)) return quattro_skip_setting_quirk(chip, iface, altno); /* fasttrackpro usb: skip altsets incompatible with device_setup */ if (chip->usb_id == USB_ID(0x0763, 0x2012)) return fasttrackpro_skip_setting_quirk(chip, iface, altno); /* presonus studio 1810c: skip altsets incompatible with device_setup */ if (chip->usb_id == USB_ID(0x194f, 0x010c)) return s1810c_skip_setting_quirk(chip, iface, altno); return 0; } int snd_usb_apply_boot_quirk(struct usb_device *dev, struct usb_interface *intf, const struct snd_usb_audio_quirk *quirk, unsigned int id) { switch (id) { case USB_ID(0x041e, 0x3000): /* SB Extigy needs special boot-up sequence */ /* if more models come, this will go to the quirk list. */ return snd_usb_extigy_boot_quirk(dev, intf); case USB_ID(0x041e, 0x3020): /* SB Audigy 2 NX needs its own boot-up magic, too */ return snd_usb_audigy2nx_boot_quirk(dev); case USB_ID(0x10f5, 0x0200): /* C-Media CM106 / Turtle Beach Audio Advantage Roadie */ return snd_usb_cm106_boot_quirk(dev); case USB_ID(0x0d8c, 0x0102): /* C-Media CM6206 / CM106-Like Sound Device */ case USB_ID(0x0ccd, 0x00b1): /* Terratec Aureon 7.1 USB */ return snd_usb_cm6206_boot_quirk(dev); case USB_ID(0x0dba, 0x3000): /* Digidesign Mbox 2 */ return snd_usb_mbox2_boot_quirk(dev); case USB_ID(0x0dba, 0x5000): /* Digidesign Mbox 3 */ return snd_usb_mbox3_boot_quirk(dev); case USB_ID(0x1235, 0x0010): /* Focusrite Novation Saffire 6 USB */ case USB_ID(0x1235, 0x0018): /* Focusrite Novation Twitch */ return snd_usb_novation_boot_quirk(dev); case USB_ID(0x133e, 0x0815): /* Access Music VirusTI Desktop */ return snd_usb_accessmusic_boot_quirk(dev); case USB_ID(0x17cc, 0x1000): /* Komplete Audio 6 */ case USB_ID(0x17cc, 0x1010): /* Traktor Audio 6 */ case USB_ID(0x17cc, 0x1020): /* Traktor Audio 10 */ return snd_usb_nativeinstruments_boot_quirk(dev); case USB_ID(0x0763, 0x2012): /* M-Audio Fast Track Pro USB */ return snd_usb_fasttrackpro_boot_quirk(dev); case USB_ID(0x047f, 0xc010): /* Plantronics Gamecom 780 */ return snd_usb_gamecon780_boot_quirk(dev); case USB_ID(0x2466, 0x8010): /* Fractal Audio Axe-Fx 3 */ return snd_usb_axefx3_boot_quirk(dev); case USB_ID(0x07fd, 0x0004): /* MOTU MicroBook II */ /* * For some reason interface 3 with vendor-spec class is * detected on MicroBook IIc. */ if (get_iface_desc(intf->altsetting)->bInterfaceClass == USB_CLASS_VENDOR_SPEC && get_iface_desc(intf->altsetting)->bInterfaceNumber < 3) return snd_usb_motu_microbookii_boot_quirk(dev); break; case USB_ID(0x2a39, 0x3f8c): /* RME Digiface USB */ case USB_ID(0x2a39, 0x3fa0): /* RME Digiface USB (alternate) */ return snd_usb_rme_digiface_boot_quirk(dev); } return 0; } int snd_usb_apply_boot_quirk_once(struct usb_device *dev, struct usb_interface *intf, const struct snd_usb_audio_quirk *quirk, unsigned int id) { switch (id) { case USB_ID(0x07fd, 0x0008): /* MOTU M Series, 1st hardware version */ return snd_usb_motu_m_series_boot_quirk(dev); } return 0; } /* * check if the device uses big-endian samples */ int snd_usb_is_big_endian_format(struct snd_usb_audio *chip, const struct audioformat *fp) { /* it depends on altsetting whether the device is big-endian or not */ switch (chip->usb_id) { case USB_ID(0x0763, 0x2001): /* M-Audio Quattro: captured data only */ if (fp->altsetting == 2 || fp->altsetting == 3 || fp->altsetting == 5 || fp->altsetting == 6) return 1; break; case USB_ID(0x0763, 0x2003): /* M-Audio Audiophile USB */ if (chip->setup == 0x00 || fp->altsetting == 1 || fp->altsetting == 2 || fp->altsetting == 3) return 1; break; case USB_ID(0x0763, 0x2012): /* M-Audio Fast Track Pro */ if (fp->altsetting == 2 || fp->altsetting == 3 || fp->altsetting == 5 || fp->altsetting == 6) return 1; break; } return 0; } /* * For E-Mu 0404USB/0202USB/TrackerPre/0204 sample rate should be set for device, * not for interface. */ enum { EMU_QUIRK_SR_44100HZ = 0, EMU_QUIRK_SR_48000HZ, EMU_QUIRK_SR_88200HZ, EMU_QUIRK_SR_96000HZ, EMU_QUIRK_SR_176400HZ, EMU_QUIRK_SR_192000HZ }; static void set_format_emu_quirk(struct snd_usb_substream *subs, const struct audioformat *fmt) { unsigned char emu_samplerate_id = 0; /* When capture is active * sample rate shouldn't be changed * by playback substream */ if (subs->direction == SNDRV_PCM_STREAM_PLAYBACK) { if (subs->stream->substream[SNDRV_PCM_STREAM_CAPTURE].cur_audiofmt) return; } switch (fmt->rate_min) { case 48000: emu_samplerate_id = EMU_QUIRK_SR_48000HZ; break; case 88200: emu_samplerate_id = EMU_QUIRK_SR_88200HZ; break; case 96000: emu_samplerate_id = EMU_QUIRK_SR_96000HZ; break; case 176400: emu_samplerate_id = EMU_QUIRK_SR_176400HZ; break; case 192000: emu_samplerate_id = EMU_QUIRK_SR_192000HZ; break; default: emu_samplerate_id = EMU_QUIRK_SR_44100HZ; break; } snd_emuusb_set_samplerate(subs->stream->chip, emu_samplerate_id); subs->pkt_offset_adj = (emu_samplerate_id >= EMU_QUIRK_SR_176400HZ) ? 4 : 0; } static int pioneer_djm_set_format_quirk(struct snd_usb_substream *subs, u16 windex) { unsigned int cur_rate = subs->data_endpoint->cur_rate; u8 sr[3]; // Convert to little endian sr[0] = cur_rate & 0xff; sr[1] = (cur_rate >> 8) & 0xff; sr[2] = (cur_rate >> 16) & 0xff; usb_set_interface(subs->dev, 0, 1); // we should derive windex from fmt-sync_ep but it's not set snd_usb_ctl_msg(subs->stream->chip->dev, usb_sndctrlpipe(subs->stream->chip->dev, 0), 0x01, 0x22, 0x0100, windex, &sr, 0x0003); return 0; } static void mbox3_set_format_quirk(struct snd_usb_substream *subs, const struct audioformat *fmt) { __le32 buff4 = 0; u8 buff1 = 0x01; u32 new_rate = subs->data_endpoint->cur_rate; u32 current_rate; // Get current rate from card and check if changing it is needed snd_usb_ctl_msg(subs->dev, usb_rcvctrlpipe(subs->dev, 0), 0x01, 0x21 | USB_DIR_IN, 0x0100, 0x8101, &buff4, 4); current_rate = le32_to_cpu(buff4); dev_dbg(&subs->dev->dev, "MBOX3: Current configured sample rate: %d", current_rate); if (current_rate == new_rate) { dev_dbg(&subs->dev->dev, "MBOX3: No change needed (current rate:%d == new rate:%d)", current_rate, new_rate); return; } // Set new rate dev_info(&subs->dev->dev, "MBOX3: Changing sample rate to: %d", new_rate); buff4 = cpu_to_le32(new_rate); snd_usb_ctl_msg(subs->dev, usb_sndctrlpipe(subs->dev, 0), 0x01, 0x21, 0x0100, 0x8101, &buff4, 4); // Set clock source to Internal snd_usb_ctl_msg(subs->dev, usb_sndctrlpipe(subs->dev, 0), 0x01, 0x21, 0x0100, 0x8001, &buff1, 1); // Check whether the change was successful buff4 = 0; snd_usb_ctl_msg(subs->dev, usb_rcvctrlpipe(subs->dev, 0), 0x01, 0x21 | USB_DIR_IN, 0x0100, 0x8101, &buff4, 4); if (new_rate != le32_to_cpu(buff4)) dev_warn(&subs->dev->dev, "MBOX3: Couldn't set the sample rate"); } static const int rme_digiface_rate_table[] = { 32000, 44100, 48000, 0, 64000, 88200, 96000, 0, 128000, 176400, 192000, 0, }; static int rme_digiface_set_format_quirk(struct snd_usb_substream *subs) { unsigned int cur_rate = subs->data_endpoint->cur_rate; u16 val; int speed_mode; int id; for (id = 0; id < ARRAY_SIZE(rme_digiface_rate_table); id++) { if (rme_digiface_rate_table[id] == cur_rate) break; } if (id >= ARRAY_SIZE(rme_digiface_rate_table)) return -EINVAL; /* 2, 3, 4 for 1x, 2x, 4x */ speed_mode = (id >> 2) + 2; val = (id << 3) | (speed_mode << 12); /* Set the sample rate */ snd_usb_ctl_msg(subs->stream->chip->dev, usb_sndctrlpipe(subs->stream->chip->dev, 0), 16, 0x40, val, 0x7078, NULL, 0); return 0; } void snd_usb_set_format_quirk(struct snd_usb_substream *subs, const struct audioformat *fmt) { switch (subs->stream->chip->usb_id) { case USB_ID(0x041e, 0x3f02): /* E-Mu 0202 USB */ case USB_ID(0x041e, 0x3f04): /* E-Mu 0404 USB */ case USB_ID(0x041e, 0x3f0a): /* E-Mu Tracker Pre */ case USB_ID(0x041e, 0x3f19): /* E-Mu 0204 USB */ set_format_emu_quirk(subs, fmt); break; case USB_ID(0x534d, 0x0021): /* MacroSilicon MS2100/MS2106 */ case USB_ID(0x534d, 0x2109): /* MacroSilicon MS2109 */ subs->stream_offset_adj = 2; break; case USB_ID(0x2b73, 0x000a): /* Pioneer DJM-900NXS2 */ case USB_ID(0x2b73, 0x0013): /* Pioneer DJM-450 */ case USB_ID(0x2b73, 0x0034): /* Pioneer DJM-V10 */ pioneer_djm_set_format_quirk(subs, 0x0082); break; case USB_ID(0x08e4, 0x017f): /* Pioneer DJM-750 */ case USB_ID(0x08e4, 0x0163): /* Pioneer DJM-850 */ pioneer_djm_set_format_quirk(subs, 0x0086); break; case USB_ID(0x0dba, 0x5000): mbox3_set_format_quirk(subs, fmt); /* Digidesign Mbox 3 */ break; case USB_ID(0x2a39, 0x3f8c): /* RME Digiface USB */ case USB_ID(0x2a39, 0x3fa0): /* RME Digiface USB (alternate) */ rme_digiface_set_format_quirk(subs); break; } } int snd_usb_select_mode_quirk(struct snd_usb_audio *chip, const struct audioformat *fmt) { struct usb_device *dev = chip->dev; int err; if (chip->quirk_flags & QUIRK_FLAG_ITF_USB_DSD_DAC) { /* First switch to alt set 0, otherwise the mode switch cmd * will not be accepted by the DAC */ err = usb_set_interface(dev, fmt->iface, 0); if (err < 0) return err; msleep(20); /* Delay needed after setting the interface */ /* Vendor mode switch cmd is required. */ if (fmt->formats & SNDRV_PCM_FMTBIT_DSD_U32_BE) { /* DSD mode (DSD_U32) requested */ err = snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 0, USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, 1, 1, NULL, 0); if (err < 0) return err; } else { /* PCM or DOP mode (S32) requested */ /* PCM mode (S16) requested */ err = snd_usb_ctl_msg(dev, usb_sndctrlpipe(dev, 0), 0, USB_DIR_OUT|USB_TYPE_VENDOR|USB_RECIP_INTERFACE, 0, 1, NULL, 0); if (err < 0) return err; } msleep(20); } return 0; } void snd_usb_endpoint_start_quirk(struct snd_usb_endpoint *ep) { /* * "Playback Design" products send bogus feedback data at the start * of the stream. Ignore them. */ if (USB_ID_VENDOR(ep->chip->usb_id) == 0x23ba && ep->type == SND_USB_ENDPOINT_TYPE_SYNC) ep->skip_packets = 4; /* * M-Audio Fast Track C400/C600 - when packets are not skipped, real * world latency varies by approx. +/- 50 frames (at 96kHz) each time * the stream is (re)started. When skipping packets 16 at endpoint * start up, the real world latency is stable within +/- 1 frame (also * across power cycles). */ if ((ep->chip->usb_id == USB_ID(0x0763, 0x2030) || ep->chip->usb_id == USB_ID(0x0763, 0x2031)) && ep->type == SND_USB_ENDPOINT_TYPE_DATA) ep->skip_packets = 16; /* Work around devices that report unreasonable feedback data */ if ((ep->chip->usb_id == USB_ID(0x0644, 0x8038) || /* TEAC UD-H01 */ ep->chip->usb_id == USB_ID(0x1852, 0x5034)) && /* T+A Dac8 */ ep->syncmaxsize == 4) ep->tenor_fb_quirk = 1; } /* quirk applied after snd_usb_ctl_msg(); not applied during boot quirks */ void snd_usb_ctl_msg_quirk(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size) { struct snd_usb_audio *chip = dev_get_drvdata(&dev->dev); if (!chip || (requesttype & USB_TYPE_MASK) != USB_TYPE_CLASS) return; if (chip->quirk_flags & QUIRK_FLAG_CTL_MSG_DELAY) msleep(20); else if (chip->quirk_flags & QUIRK_FLAG_CTL_MSG_DELAY_1M) usleep_range(1000, 2000); else if (chip->quirk_flags & QUIRK_FLAG_CTL_MSG_DELAY_5M) usleep_range(5000, 6000); } /* * snd_usb_interface_dsd_format_quirks() is called from format.c to * augment the PCM format bit-field for DSD types. The UAC standards * don't have a designated bit field to denote DSD-capable interfaces, * hence all hardware that is known to support this format has to be * listed here. */ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, struct audioformat *fp, unsigned int sample_bytes) { struct usb_interface *iface; /* Playback Designs */ if (USB_ID_VENDOR(chip->usb_id) == 0x23ba && USB_ID_PRODUCT(chip->usb_id) < 0x0110) { switch (fp->altsetting) { case 1: fp->dsd_dop = true; return SNDRV_PCM_FMTBIT_DSD_U16_LE; case 2: fp->dsd_bitrev = true; return SNDRV_PCM_FMTBIT_DSD_U8; case 3: fp->dsd_bitrev = true; return SNDRV_PCM_FMTBIT_DSD_U16_LE; } } /* XMOS based USB DACs */ switch (chip->usb_id) { case USB_ID(0x139f, 0x5504): /* Nagra DAC */ case USB_ID(0x20b1, 0x3089): /* Mola-Mola DAC */ case USB_ID(0x2522, 0x0007): /* LH Labs Geek Out 1V5 */ case USB_ID(0x2522, 0x0009): /* LH Labs Geek Pulse X Inifinity 2V0 */ case USB_ID(0x2522, 0x0012): /* LH Labs VI DAC Infinity */ case USB_ID(0x2772, 0x0230): /* Pro-Ject Pre Box S2 Digital */ if (fp->altsetting == 2) return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; case USB_ID(0x0d8c, 0x0316): /* Hegel HD12 DSD */ case USB_ID(0x10cb, 0x0103): /* The Bit Opus #3; with fp->dsd_raw */ case USB_ID(0x16d0, 0x06b2): /* NuPrime DAC-10 */ case USB_ID(0x16d0, 0x06b4): /* NuPrime Audio HD-AVP/AVA */ case USB_ID(0x16d0, 0x0733): /* Furutech ADL Stratos */ case USB_ID(0x16d0, 0x09d8): /* NuPrime IDA-8 */ case USB_ID(0x16d0, 0x09db): /* NuPrime Audio DAC-9 */ case USB_ID(0x16d0, 0x09dd): /* Encore mDSD */ case USB_ID(0x16d0, 0x0ab1): /* PureAudio APA DAC */ case USB_ID(0x16d0, 0xeca1): /* PureAudio Lotus DAC5, DAC5 SE, DAC5 Pro */ case USB_ID(0x1db5, 0x0003): /* Bryston BDA3 */ case USB_ID(0x20a0, 0x4143): /* WaveIO USB Audio 2.0 */ case USB_ID(0x22e1, 0xca01): /* HDTA Serenade DSD */ case USB_ID(0x249c, 0x9326): /* M2Tech Young MkIII */ case USB_ID(0x2616, 0x0106): /* PS Audio NuWave DAC */ case USB_ID(0x2622, 0x0041): /* Audiolab M-DAC+ */ case USB_ID(0x2622, 0x0061): /* LEAK Stereo 230 */ case USB_ID(0x278b, 0x5100): /* Rotel RC-1590 */ case USB_ID(0x27f7, 0x3002): /* W4S DAC-2v2SE */ case USB_ID(0x29a2, 0x0086): /* Mutec MC3+ USB */ case USB_ID(0x6b42, 0x0042): /* MSB Technology */ if (fp->altsetting == 3) return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; /* Amanero Combo384 USB based DACs with native DSD support */ case USB_ID(0x16d0, 0x071a): /* Amanero - Combo384 */ if (fp->altsetting == 2) { switch (le16_to_cpu(chip->dev->descriptor.bcdDevice)) { case 0x199: return SNDRV_PCM_FMTBIT_DSD_U32_LE; case 0x19b: case 0x203: return SNDRV_PCM_FMTBIT_DSD_U32_BE; default: break; } } break; case USB_ID(0x16d0, 0x0a23): if (fp->altsetting == 2) return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; default: break; } /* ITF-USB DSD based DACs */ if (chip->quirk_flags & QUIRK_FLAG_ITF_USB_DSD_DAC) { iface = usb_ifnum_to_if(chip->dev, fp->iface); /* Altsetting 2 support native DSD if the num of altsets is * three (0-2), * Altsetting 3 support native DSD if the num of altsets is * four (0-3). */ if (fp->altsetting == iface->num_altsetting - 1) return SNDRV_PCM_FMTBIT_DSD_U32_BE; } /* Mostly generic method to detect many DSD-capable implementations */ if ((chip->quirk_flags & QUIRK_FLAG_DSD_RAW) && fp->dsd_raw) return SNDRV_PCM_FMTBIT_DSD_U32_BE; return 0; } void snd_usb_audioformat_attributes_quirk(struct snd_usb_audio *chip, struct audioformat *fp, int stream) { switch (chip->usb_id) { case USB_ID(0x0a92, 0x0053): /* AudioTrak Optoplay */ /* Optoplay sets the sample rate attribute although * it seems not supporting it in fact. */ fp->attributes &= ~UAC_EP_CS_ATTR_SAMPLE_RATE; break; case USB_ID(0x041e, 0x3020): /* Creative SB Audigy 2 NX */ case USB_ID(0x0763, 0x2003): /* M-Audio Audiophile USB */ /* doesn't set the sample rate attribute, but supports it */ fp->attributes |= UAC_EP_CS_ATTR_SAMPLE_RATE; break; case USB_ID(0x0763, 0x2001): /* M-Audio Quattro USB */ case USB_ID(0x0763, 0x2012): /* M-Audio Fast Track Pro USB */ case USB_ID(0x047f, 0x0ca1): /* plantronics headset */ case USB_ID(0x077d, 0x07af): /* Griffin iMic (note that there is an older model 77d:223) */ /* * plantronics headset and Griffin iMic have set adaptive-in * although it's really not... */ fp->ep_attr &= ~USB_ENDPOINT_SYNCTYPE; if (stream == SNDRV_PCM_STREAM_PLAYBACK) fp->ep_attr |= USB_ENDPOINT_SYNC_ADAPTIVE; else fp->ep_attr |= USB_ENDPOINT_SYNC_SYNC; break; case USB_ID(0x07fd, 0x0004): /* MOTU MicroBook IIc */ /* * MaxPacketsOnly attribute is erroneously set in endpoint * descriptors. As a result this card produces noise with * all sample rates other than 96 kHz. */ fp->attributes &= ~UAC_EP_CS_ATTR_FILL_MAX; break; case USB_ID(0x1224, 0x2a25): /* Jieli Technology USB PHY 2.0 */ /* mic works only when ep packet size is set to wMaxPacketSize */ fp->attributes |= UAC_EP_CS_ATTR_FILL_MAX; break; case USB_ID(0x3511, 0x2b1e): /* Opencomm2 UC USB Bluetooth dongle */ /* mic works only when ep pitch control is not set */ if (stream == SNDRV_PCM_STREAM_CAPTURE) fp->attributes &= ~UAC_EP_CS_ATTR_PITCH_CONTROL; break; } } /* * driver behavior quirk flags */ struct usb_audio_quirk_flags_table { u32 id; u32 flags; }; #define DEVICE_FLG(vid, pid, _flags) \ { .id = USB_ID(vid, pid), .flags = (_flags) } #define VENDOR_FLG(vid, _flags) DEVICE_FLG(vid, 0, _flags) static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { /* Device matches */ DEVICE_FLG(0x03f0, 0x654a, /* HP 320 FHD Webcam */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16), DEVICE_FLG(0x041e, 0x3000, /* Creative SB Extigy */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x041e, 0x4080, /* Creative Live Cam VF0610 */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x045e, 0x083c, /* MS USB Link headset */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_DISABLE_AUTOSUSPEND), DEVICE_FLG(0x045e, 0x070f, /* MS LifeChat LX-3000 Headset */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x046d, 0x0807, /* Logitech Webcam C500 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x0808, /* Logitech Webcam C600 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x0809, QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x0819, /* Logitech Webcam C210 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x081b, /* HD Webcam c310 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x081d, /* HD Webcam c510 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x0825, /* HD Webcam c270 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x0826, /* HD Webcam c525 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x084c, /* Logitech ConferenceCam Connect */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x046d, 0x08ca, /* Logitech Quickcam Fusion */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x0991, /* Logitech QuickCam Pro */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_IGNORE_CTL_ERROR | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x09a2, /* QuickCam Communicate Deluxe/S7500 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIC_RES_384), DEVICE_FLG(0x046d, 0x09a4, /* Logitech QuickCam E 3500 */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x046d, 0x0a8f, /* Logitech H390 headset */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x0499, 0x1506, /* Yamaha THR5 */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x0499, 0x1509, /* Steinberg UR22 */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x0499, 0x3108, /* Yamaha YIT-W12TX */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x04d8, 0xfeea, /* Benchmark DAC1 Pre */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x04e8, 0xa051, /* Samsung USBC Headset (AKG) */ QUIRK_FLAG_SKIP_CLOCK_SELECTOR | QUIRK_FLAG_CTL_MSG_DELAY_5M), DEVICE_FLG(0x0525, 0xa4ad, /* Hamedal C20 usb camero */ QUIRK_FLAG_IFACE_SKIP_CLOSE), DEVICE_FLG(0x054c, 0x0b8c, /* Sony WALKMAN NW-A45 DAC */ QUIRK_FLAG_SET_IFACE_FIRST), DEVICE_FLG(0x0556, 0x0014, /* Phoenix Audio TMX320VC */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0572, 0x1b08, /* Conexant Systems (Rockwell), Inc. */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x0572, 0x1b09, /* Conexant Systems (Rockwell), Inc. */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x05a3, 0x9420, /* ELP HD USB Camera */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x05a7, 0x1020, /* Bose Companion 5 */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x05e1, 0x0408, /* Syntek STK1160 */ QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x05e1, 0x0480, /* Hauppauge Woodbury */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x0644, 0x8043, /* TEAC UD-501/UD-501V2/UD-503/NT-503 */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY), DEVICE_FLG(0x0644, 0x8044, /* Esoteric D-05X */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY), DEVICE_FLG(0x0644, 0x804a, /* TEAC UD-301 */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY), DEVICE_FLG(0x0644, 0x805f, /* TEAC Model 12 */ QUIRK_FLAG_FORCE_IFACE_RESET), DEVICE_FLG(0x0644, 0x806b, /* TEAC UD-701 */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY), DEVICE_FLG(0x06f8, 0xb000, /* Hercules DJ Console (Windows Edition) */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x06f8, 0xd002, /* Hercules DJ Console (Macintosh Edition) */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x0711, 0x5800, /* MCT Trigger 5 USB-to-HDMI */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x074d, 0x3553, /* Outlaw RR2150 (Micronas UAC3553B) */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0763, 0x2030, /* M-Audio Fast Track C400 */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x0763, 0x2031, /* M-Audio Fast Track C600 */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x07fd, 0x000b, /* MOTU M Series 2nd hardware revision */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x08bb, 0x2702, /* LineX FM Transmitter */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x0951, 0x16ad, /* Kingston HyperX */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0b05, 0x18a6, /* ASUSTek Computer, Inc. */ QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE), DEVICE_FLG(0x0b0e, 0x0349, /* Jabra 550a */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0bda, 0x498a, /* Realtek Semiconductor Corp. */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE | QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE), DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0c45, 0x636b, /* Microdia JP001 USB Camera */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0d8c, 0x000c, /* C-Media */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x0d8c, 0x0012, /* C-Media */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x0d8c, 0x0014, /* C-Media */ QUIRK_FLAG_CTL_MSG_DELAY_1M | QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */ QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */ QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0fd9, 0x0008, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x1038, 0x1294, /* SteelSeries Arctis Pro Wireless */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x12d1, 0x3a07, /* Huawei Technologies Co., Ltd. */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE | QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE), DEVICE_FLG(0x1224, 0x2a25, /* Jieli Technology USB PHY 2.0 */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16), DEVICE_FLG(0x1395, 0x740a, /* Sennheiser DECT */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x1397, 0x0507, /* Behringer UMC202HD */ QUIRK_FLAG_PLAYBACK_FIRST | QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x1397, 0x0508, /* Behringer UMC204HD */ QUIRK_FLAG_PLAYBACK_FIRST | QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x1397, 0x0509, /* Behringer UMC404HD */ QUIRK_FLAG_PLAYBACK_FIRST | QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x13e5, 0x0001, /* Serato Phono */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x154e, 0x1002, /* Denon DCD-1500RE */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), DEVICE_FLG(0x154e, 0x1003, /* Denon DA-300USB */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), DEVICE_FLG(0x154e, 0x3005, /* Marantz HD-DAC1 */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), DEVICE_FLG(0x154e, 0x3006, /* Marantz SA-14S1 */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), DEVICE_FLG(0x154e, 0x300b, /* Marantz SA-KI RUBY / SA-12 */ QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x154e, 0x500e, /* Denon DN-X1600 */ QUIRK_FLAG_IGNORE_CLOCK_SOURCE), DEVICE_FLG(0x1686, 0x00dd, /* Zoom R16/24 */ QUIRK_FLAG_TX_LENGTH | QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x16d0, 0x0ab1, /* PureAudio APA DAC */ QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x16d0, 0xeca1, /* PureAudio Lotus DAC5, DAC5 SE and DAC5 Pro */ QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x17aa, 0x1046, /* Lenovo ThinkStation P620 Rear Line-in, Line-out and Microphone */ QUIRK_FLAG_DISABLE_AUTOSUSPEND), DEVICE_FLG(0x17aa, 0x104d, /* Lenovo ThinkStation P620 Internal Speaker + Front Headset */ QUIRK_FLAG_DISABLE_AUTOSUSPEND), DEVICE_FLG(0x17ef, 0x3083, /* Lenovo TBT3 dock */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x1852, 0x5062, /* Luxman D-08u */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), DEVICE_FLG(0x1852, 0x5065, /* Luxman DA-06 */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), DEVICE_FLG(0x1901, 0x0191, /* GE B850V3 CP2114 audio interface */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x19f7, 0x0003, /* RODE NT-USB */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x19f7, 0x0035, /* RODE NT-USB+ */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16), DEVICE_FLG(0x1bcf, 0x2283, /* NexiGo N930AF FHD Webcam */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16), DEVICE_FLG(0x2040, 0x7200, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7201, /* Hauppauge HVR-950Q-MXL */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7210, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7211, /* Hauppauge HVR-950Q-MXL */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7213, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7217, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x721b, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x721e, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x721f, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7240, /* Hauppauge HVR-850 */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7260, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7270, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7280, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x7281, /* Hauppauge HVR-950Q-MXL */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x2040, 0x8200, /* Hauppauge Woodbury */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x21b4, 0x0081, /* AudioQuest DragonFly */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x21b4, 0x0230, /* Ayre QB-9 Twenty */ QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x21b4, 0x0232, /* Ayre QX-5 Twenty */ QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x2522, 0x0007, /* LH Labs Geek Out HD Audio 1V5 */ QUIRK_FLAG_SET_IFACE_FIRST), DEVICE_FLG(0x262a, 0x9302, /* ddHiFi TC44C */ QUIRK_FLAG_DSD_RAW), DEVICE_FLG(0x2708, 0x0002, /* Audient iD14 */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x2912, 0x30c8, /* Audioengine D1 */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x2a70, 0x1881, /* OnePlus Technology (Shenzhen) Co., Ltd. BE02T */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE | QUIRK_FLAG_MIXER_CAPTURE_MIN_MUTE), DEVICE_FLG(0x2b53, 0x0023, /* Fiero SC-01 (firmware v1.0.0 @ 48 kHz) */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x2b53, 0x0024, /* Fiero SC-01 (firmware v1.0.0 @ 96 kHz) */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x2b53, 0x0031, /* Fiero SC-01 (firmware v1.1.0) */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x2d95, 0x8011, /* VIVO USB-C HEADSET */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x2d95, 0x8021, /* VIVO USB-C-XE710 HEADSET */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x2d99, 0x0026, /* HECATE G2 GAMING HEADSET */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x2fc6, 0xf0b7, /* iBasso DC07 Pro */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x30be, 0x0101, /* Schiit Hel */ QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x339b, 0x3a07, /* Synaptics HONOR USB-C HEADSET */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x413c, 0xa506, /* Dell AE515 sound bar */ QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x534d, 0x0021, /* MacroSilicon MS2100/MS2106 */ QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x534d, 0x2109, /* MacroSilicon MS2109 */ QUIRK_FLAG_ALIGN_TRANSFER), /* Vendor matches */ VENDOR_FLG(0x045e, /* MS Lifecam */ QUIRK_FLAG_GET_SAMPLE_RATE), VENDOR_FLG(0x046d, /* Logitech */ QUIRK_FLAG_CTL_MSG_DELAY_1M), VENDOR_FLG(0x047f, /* Plantronics */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_CTL_MSG_DELAY), VENDOR_FLG(0x0644, /* TEAC Corp. */ QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY), VENDOR_FLG(0x07fd, /* MOTU */ QUIRK_FLAG_VALIDATE_RATES), VENDOR_FLG(0x1235, /* Focusrite Novation */ QUIRK_FLAG_VALIDATE_RATES), VENDOR_FLG(0x1511, /* AURALiC */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x152a, /* Thesycon devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x18d1, /* iBasso devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x1de7, /* Phoenix Audio */ QUIRK_FLAG_GET_SAMPLE_RATE), VENDOR_FLG(0x20b1, /* XMOS based devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x21ed, /* Accuphase Laboratory */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x22d9, /* Oppo */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x23ba, /* Playback Design */ QUIRK_FLAG_CTL_MSG_DELAY | QUIRK_FLAG_IFACE_DELAY | QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x25ce, /* Mytek devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x2622, /* IAG Limited devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x278b, /* Rotel? */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x292b, /* Gustard/Ess based devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x2972, /* FiiO devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x2ab6, /* T+A devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x2afd, /* McIntosh Laboratory, Inc. */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x2d87, /* Cayin device */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x2fc6, /* Comture-inc devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x3336, /* HEM devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x3353, /* Khadas devices */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x35f4, /* MSB Technology */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0x3842, /* EVGA */ QUIRK_FLAG_DSD_RAW), VENDOR_FLG(0xc502, /* HiBy devices */ QUIRK_FLAG_DSD_RAW), {} /* terminator */ }; #define QUIRK_STRING_ENTRY(x) \ [QUIRK_TYPE_ ## x] = __stringify(x) static const char *const snd_usb_audio_quirk_flag_names[] = { QUIRK_STRING_ENTRY(GET_SAMPLE_RATE), QUIRK_STRING_ENTRY(SHARE_MEDIA_DEVICE), QUIRK_STRING_ENTRY(ALIGN_TRANSFER), QUIRK_STRING_ENTRY(TX_LENGTH), QUIRK_STRING_ENTRY(PLAYBACK_FIRST), QUIRK_STRING_ENTRY(SKIP_CLOCK_SELECTOR), QUIRK_STRING_ENTRY(IGNORE_CLOCK_SOURCE), QUIRK_STRING_ENTRY(ITF_USB_DSD_DAC), QUIRK_STRING_ENTRY(CTL_MSG_DELAY), QUIRK_STRING_ENTRY(CTL_MSG_DELAY_1M), QUIRK_STRING_ENTRY(CTL_MSG_DELAY_5M), QUIRK_STRING_ENTRY(IFACE_DELAY), QUIRK_STRING_ENTRY(VALIDATE_RATES), QUIRK_STRING_ENTRY(DISABLE_AUTOSUSPEND), QUIRK_STRING_ENTRY(IGNORE_CTL_ERROR), QUIRK_STRING_ENTRY(DSD_RAW), QUIRK_STRING_ENTRY(SET_IFACE_FIRST), QUIRK_STRING_ENTRY(GENERIC_IMPLICIT_FB), QUIRK_STRING_ENTRY(SKIP_IMPLICIT_FB), QUIRK_STRING_ENTRY(IFACE_SKIP_CLOSE), QUIRK_STRING_ENTRY(FORCE_IFACE_RESET), QUIRK_STRING_ENTRY(FIXED_RATE), QUIRK_STRING_ENTRY(MIC_RES_16), QUIRK_STRING_ENTRY(MIC_RES_384), QUIRK_STRING_ENTRY(MIXER_PLAYBACK_MIN_MUTE), QUIRK_STRING_ENTRY(MIXER_CAPTURE_MIN_MUTE), NULL }; const char *snd_usb_quirk_flag_find_name(unsigned long index) { if (index >= ARRAY_SIZE(snd_usb_audio_quirk_flag_names)) return NULL; return snd_usb_audio_quirk_flag_names[index]; } u32 snd_usb_quirk_flags_from_name(const char *name) { int i; if (!name || !*name) return 0; for (i = 0; snd_usb_audio_quirk_flag_names[i]; i++) { if (strcasecmp(name, snd_usb_audio_quirk_flag_names[i]) == 0) return BIT_U32(i); } return 0; } void snd_usb_apply_flag_dbg(const char *reason, struct snd_usb_audio *chip, unsigned long flag) { unsigned long bit; for_each_set_bit(bit, &flag, BYTES_TO_BITS(sizeof(flag))) { const char *name = snd_usb_audio_quirk_flag_names[bit]; if (name) usb_audio_dbg(chip, "From %s apply quirk flag %s for device %04x:%04x\n", reason, name, USB_ID_VENDOR(chip->usb_id), USB_ID_PRODUCT(chip->usb_id)); else usb_audio_warn(chip, "From %s apply unknown quirk flag 0x%lx for device %04x:%04x\n", reason, bit, USB_ID_VENDOR(chip->usb_id), USB_ID_PRODUCT(chip->usb_id)); } } void snd_usb_init_quirk_flags_table(struct snd_usb_audio *chip) { const struct usb_audio_quirk_flags_table *p; for (p = quirk_flags_table; p->id; p++) { if (chip->usb_id == p->id || (!USB_ID_PRODUCT(p->id) && USB_ID_VENDOR(chip->usb_id) == USB_ID_VENDOR(p->id))) { snd_usb_apply_flag_dbg("builtin table", chip, p->flags); chip->quirk_flags |= p->flags; return; } } } void snd_usb_init_quirk_flags_parse_string(struct snd_usb_audio *chip, const char *str) { u16 chip_vid = USB_ID_VENDOR(chip->usb_id); u16 chip_pid = USB_ID_PRODUCT(chip->usb_id); u32 mask_flags, unmask_flags, bit; char *p, *field, *flag; bool is_unmask; u16 vid, pid; char *val __free(kfree) = kstrdup(str, GFP_KERNEL); if (!val) return; for (p = val; p && *p;) { /* Each entry consists of VID:PID:flags */ field = strsep(&p, ":"); if (!field) break; if (strcmp(field, "*") == 0) vid = 0; else if (kstrtou16(field, 16, &vid)) break; field = strsep(&p, ":"); if (!field) break; if (strcmp(field, "*") == 0) pid = 0; else if (kstrtou16(field, 16, &pid)) break; field = strsep(&p, ";"); if (!field || !*field) break; if ((vid != 0 && vid != chip_vid) || (pid != 0 && pid != chip_pid)) continue; /* Collect the flags */ mask_flags = 0; unmask_flags = 0; while (field && *field) { flag = strsep(&field, "|"); if (!flag) break; if (*flag == '!') { is_unmask = true; flag++; } else { is_unmask = false; } if (!kstrtou32(flag, 16, &bit)) { if (is_unmask) unmask_flags |= bit; else mask_flags |= bit; break; } bit = snd_usb_quirk_flags_from_name(flag); if (bit) { if (is_unmask) unmask_flags |= bit; else mask_flags |= bit; } else { pr_warn("snd_usb_audio: unknown flag %s while parsing param quirk_flags\n", flag); } } chip->quirk_flags &= ~unmask_flags; chip->quirk_flags |= mask_flags; snd_usb_apply_flag_dbg("module param", chip, chip->quirk_flags); } }
4 4 4 4 3 3 3 3 3 5 5 4 5 5 5 5 3 10 6 6 4 4 3 2 4 1 1 10 10 10 5 7 10 1 1 1 10 10 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/bond/bond_options.c - bonding options * Copyright (c) 2013 Jiri Pirko <jiri@resnulli.us> * Copyright (c) 2013 Scott Feldman <sfeldma@cumulusnetworks.com> */ #include <linux/errno.h> #include <linux/if.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/sched/signal.h> #include <net/bonding.h> #include <net/ndisc.h> static int bond_option_active_slave_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_miimon_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_updelay_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_downdelay_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_peer_notif_delay_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_use_carrier_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_interval_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_ip_target_add(struct bonding *bond, __be32 target); static int bond_option_arp_ip_target_rem(struct bonding *bond, __be32 target); static int bond_option_arp_ip_targets_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ns_ip6_targets_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_arp_all_targets_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_prio_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_primary_reselect_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_fail_over_mac_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_xmit_hash_policy_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_resend_igmp_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_num_peer_notif_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_all_slaves_active_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_min_links_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_lp_interval_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_pps_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_lacp_active_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_select_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_queue_id_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_slaves_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_actor_port_prio_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_user_port_key_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_missed_max_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_coupled_control_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_broadcast_neigh_set(struct bonding *bond, const struct bond_opt_value *newval); static const struct bond_opt_value bond_mode_tbl[] = { { "balance-rr", BOND_MODE_ROUNDROBIN, BOND_VALFLAG_DEFAULT}, { "active-backup", BOND_MODE_ACTIVEBACKUP, 0}, { "balance-xor", BOND_MODE_XOR, 0}, { "broadcast", BOND_MODE_BROADCAST, 0}, { "802.3ad", BOND_MODE_8023AD, 0}, { "balance-tlb", BOND_MODE_TLB, 0}, { "balance-alb", BOND_MODE_ALB, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_pps_tbl[] = { { "default", 1, BOND_VALFLAG_DEFAULT}, { "maxval", USHRT_MAX, BOND_VALFLAG_MAX}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_xmit_hashtype_tbl[] = { { "layer2", BOND_XMIT_POLICY_LAYER2, BOND_VALFLAG_DEFAULT}, { "layer3+4", BOND_XMIT_POLICY_LAYER34, 0}, { "layer2+3", BOND_XMIT_POLICY_LAYER23, 0}, { "encap2+3", BOND_XMIT_POLICY_ENCAP23, 0}, { "encap3+4", BOND_XMIT_POLICY_ENCAP34, 0}, { "vlan+srcmac", BOND_XMIT_POLICY_VLAN_SRCMAC, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_arp_validate_tbl[] = { { "none", BOND_ARP_VALIDATE_NONE, BOND_VALFLAG_DEFAULT}, { "active", BOND_ARP_VALIDATE_ACTIVE, 0}, { "backup", BOND_ARP_VALIDATE_BACKUP, 0}, { "all", BOND_ARP_VALIDATE_ALL, 0}, { "filter", BOND_ARP_FILTER, 0}, { "filter_active", BOND_ARP_FILTER_ACTIVE, 0}, { "filter_backup", BOND_ARP_FILTER_BACKUP, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_arp_all_targets_tbl[] = { { "any", BOND_ARP_TARGETS_ANY, BOND_VALFLAG_DEFAULT}, { "all", BOND_ARP_TARGETS_ALL, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_fail_over_mac_tbl[] = { { "none", BOND_FOM_NONE, BOND_VALFLAG_DEFAULT}, { "active", BOND_FOM_ACTIVE, 0}, { "follow", BOND_FOM_FOLLOW, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_intmax_tbl[] = { { "off", 0, BOND_VALFLAG_DEFAULT}, { "maxval", INT_MAX, BOND_VALFLAG_MAX}, { NULL, -1, 0} }; static const struct bond_opt_value bond_lacp_active[] = { { "off", 0, 0}, { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_lacp_rate_tbl[] = { { "slow", AD_LACP_SLOW, 0}, { "fast", AD_LACP_FAST, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_ad_select_tbl[] = { { "stable", BOND_AD_STABLE, BOND_VALFLAG_DEFAULT}, { "bandwidth", BOND_AD_BANDWIDTH, 0}, { "count", BOND_AD_COUNT, 0}, { "actor_port_prio", BOND_AD_PRIO, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_num_peer_notif_tbl[] = { { "off", 0, 0}, { "maxval", 255, BOND_VALFLAG_MAX}, { "default", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_peer_notif_delay_tbl[] = { { "off", 0, 0}, { "maxval", 300000, BOND_VALFLAG_MAX}, { NULL, -1, 0} }; static const struct bond_opt_value bond_primary_reselect_tbl[] = { { "always", BOND_PRI_RESELECT_ALWAYS, BOND_VALFLAG_DEFAULT}, { "better", BOND_PRI_RESELECT_BETTER, 0}, { "failure", BOND_PRI_RESELECT_FAILURE, 0}, { NULL, -1}, }; static const struct bond_opt_value bond_use_carrier_tbl[] = { { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_all_slaves_active_tbl[] = { { "off", 0, BOND_VALFLAG_DEFAULT}, { "on", 1, 0}, { NULL, -1, 0} }; static const struct bond_opt_value bond_resend_igmp_tbl[] = { { "off", 0, 0}, { "maxval", 255, BOND_VALFLAG_MAX}, { "default", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_lp_interval_tbl[] = { { "minval", 1, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", INT_MAX, BOND_VALFLAG_MAX}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_tlb_dynamic_lb_tbl[] = { { "off", 0, 0}, { "on", 1, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0} }; static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = { { "minval", 1, BOND_VALFLAG_MIN}, { "maxval", 65535, BOND_VALFLAG_MAX | BOND_VALFLAG_DEFAULT}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_ad_user_port_key_tbl[] = { { "minval", 0, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", 1023, BOND_VALFLAG_MAX}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_missed_max_tbl[] = { { "minval", 1, BOND_VALFLAG_MIN}, { "maxval", 255, BOND_VALFLAG_MAX}, { "default", 2, BOND_VALFLAG_DEFAULT}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_coupled_control_tbl[] = { { "on", 1, BOND_VALFLAG_DEFAULT}, { "off", 0, 0}, { NULL, -1, 0}, }; static const struct bond_opt_value bond_broadcast_neigh_tbl[] = { { "off", 0, BOND_VALFLAG_DEFAULT}, { "on", 1, 0}, { NULL, -1, 0} }; static const struct bond_option bond_opts[BOND_OPT_LAST] = { [BOND_OPT_MODE] = { .id = BOND_OPT_MODE, .name = "mode", .desc = "bond device mode", .flags = BOND_OPTFLAG_NOSLAVES | BOND_OPTFLAG_IFDOWN, .values = bond_mode_tbl, .set = bond_option_mode_set }, [BOND_OPT_PACKETS_PER_SLAVE] = { .id = BOND_OPT_PACKETS_PER_SLAVE, .name = "packets_per_slave", .desc = "Packets to send per slave in RR mode", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ROUNDROBIN)), .values = bond_pps_tbl, .set = bond_option_pps_set }, [BOND_OPT_XMIT_HASH] = { .id = BOND_OPT_XMIT_HASH, .name = "xmit_hash_policy", .desc = "balance-xor, 802.3ad, and tlb hashing method", .values = bond_xmit_hashtype_tbl, .set = bond_option_xmit_hash_policy_set }, [BOND_OPT_ARP_VALIDATE] = { .id = BOND_OPT_ARP_VALIDATE, .name = "arp_validate", .desc = "validate src/dst of ARP probes", .unsuppmodes = BIT(BOND_MODE_8023AD) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB), .values = bond_arp_validate_tbl, .set = bond_option_arp_validate_set }, [BOND_OPT_ARP_ALL_TARGETS] = { .id = BOND_OPT_ARP_ALL_TARGETS, .name = "arp_all_targets", .desc = "fail on any/all arp targets timeout", .values = bond_arp_all_targets_tbl, .set = bond_option_arp_all_targets_set }, [BOND_OPT_FAIL_OVER_MAC] = { .id = BOND_OPT_FAIL_OVER_MAC, .name = "fail_over_mac", .desc = "For active-backup, do not set all slaves to the same MAC", .flags = BOND_OPTFLAG_NOSLAVES, .values = bond_fail_over_mac_tbl, .set = bond_option_fail_over_mac_set }, [BOND_OPT_ARP_INTERVAL] = { .id = BOND_OPT_ARP_INTERVAL, .name = "arp_interval", .desc = "arp interval in milliseconds", .unsuppmodes = BIT(BOND_MODE_8023AD) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB), .values = bond_intmax_tbl, .set = bond_option_arp_interval_set }, [BOND_OPT_MISSED_MAX] = { .id = BOND_OPT_MISSED_MAX, .name = "arp_missed_max", .desc = "Maximum number of missed ARP interval", .unsuppmodes = BIT(BOND_MODE_8023AD) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB), .values = bond_missed_max_tbl, .set = bond_option_missed_max_set }, [BOND_OPT_ARP_TARGETS] = { .id = BOND_OPT_ARP_TARGETS, .name = "arp_ip_target", .desc = "arp targets in n.n.n.n form", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_arp_ip_targets_set }, [BOND_OPT_NS_TARGETS] = { .id = BOND_OPT_NS_TARGETS, .name = "ns_ip6_target", .desc = "NS targets in ffff:ffff::ffff:ffff form", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_ns_ip6_targets_set }, [BOND_OPT_DOWNDELAY] = { .id = BOND_OPT_DOWNDELAY, .name = "downdelay", .desc = "Delay before considering link down, in milliseconds", .values = bond_intmax_tbl, .set = bond_option_downdelay_set }, [BOND_OPT_UPDELAY] = { .id = BOND_OPT_UPDELAY, .name = "updelay", .desc = "Delay before considering link up, in milliseconds", .values = bond_intmax_tbl, .set = bond_option_updelay_set }, [BOND_OPT_LACP_ACTIVE] = { .id = BOND_OPT_LACP_ACTIVE, .name = "lacp_active", .desc = "Send LACPDU frames with configured lacp rate or acts as speak when spoken to", .flags = BOND_OPTFLAG_IFDOWN, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_lacp_active, .set = bond_option_lacp_active_set }, [BOND_OPT_LACP_RATE] = { .id = BOND_OPT_LACP_RATE, .name = "lacp_rate", .desc = "LACPDU tx rate to request from 802.3ad partner", .flags = BOND_OPTFLAG_IFDOWN, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_lacp_rate_tbl, .set = bond_option_lacp_rate_set }, [BOND_OPT_MINLINKS] = { .id = BOND_OPT_MINLINKS, .name = "min_links", .desc = "Minimum number of available links before turning on carrier", .values = bond_intmax_tbl, .set = bond_option_min_links_set }, [BOND_OPT_AD_SELECT] = { .id = BOND_OPT_AD_SELECT, .name = "ad_select", .desc = "802.3ad aggregation selection logic", .flags = BOND_OPTFLAG_IFDOWN, .values = bond_ad_select_tbl, .set = bond_option_ad_select_set }, [BOND_OPT_NUM_PEER_NOTIF] = { .id = BOND_OPT_NUM_PEER_NOTIF, .name = "num_unsol_na", .desc = "Number of peer notifications to send on failover event", .values = bond_num_peer_notif_tbl, .set = bond_option_num_peer_notif_set }, [BOND_OPT_MIIMON] = { .id = BOND_OPT_MIIMON, .name = "miimon", .desc = "Link check interval in milliseconds", .values = bond_intmax_tbl, .set = bond_option_miimon_set }, [BOND_OPT_PRIO] = { .id = BOND_OPT_PRIO, .name = "prio", .desc = "Link priority for failover re-selection", .flags = BOND_OPTFLAG_RAWVAL, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .set = bond_option_prio_set }, [BOND_OPT_PRIMARY] = { .id = BOND_OPT_PRIMARY, .name = "primary", .desc = "Primary network device to use", .flags = BOND_OPTFLAG_RAWVAL, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .set = bond_option_primary_set }, [BOND_OPT_PRIMARY_RESELECT] = { .id = BOND_OPT_PRIMARY_RESELECT, .name = "primary_reselect", .desc = "Reselect primary slave once it comes up", .values = bond_primary_reselect_tbl, .set = bond_option_primary_reselect_set }, [BOND_OPT_USE_CARRIER] = { .id = BOND_OPT_USE_CARRIER, .name = "use_carrier", .desc = "option obsolete, use_carrier cannot be disabled", .values = bond_use_carrier_tbl, .set = bond_option_use_carrier_set }, [BOND_OPT_ACTIVE_SLAVE] = { .id = BOND_OPT_ACTIVE_SLAVE, .name = "active_slave", .desc = "Currently active slave", .flags = BOND_OPTFLAG_RAWVAL, .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_ACTIVEBACKUP) | BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .set = bond_option_active_slave_set }, [BOND_OPT_QUEUE_ID] = { .id = BOND_OPT_QUEUE_ID, .name = "queue_id", .desc = "Set queue id of a slave", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_queue_id_set }, [BOND_OPT_ALL_SLAVES_ACTIVE] = { .id = BOND_OPT_ALL_SLAVES_ACTIVE, .name = "all_slaves_active", .desc = "Keep all frames received on an interface by setting active flag for all slaves", .values = bond_all_slaves_active_tbl, .set = bond_option_all_slaves_active_set }, [BOND_OPT_RESEND_IGMP] = { .id = BOND_OPT_RESEND_IGMP, .name = "resend_igmp", .desc = "Number of IGMP membership reports to send on link failure", .values = bond_resend_igmp_tbl, .set = bond_option_resend_igmp_set }, [BOND_OPT_LP_INTERVAL] = { .id = BOND_OPT_LP_INTERVAL, .name = "lp_interval", .desc = "The number of seconds between instances where the bonding driver sends learning packets to each slave's peer switch", .values = bond_lp_interval_tbl, .set = bond_option_lp_interval_set }, [BOND_OPT_SLAVES] = { .id = BOND_OPT_SLAVES, .name = "slaves", .desc = "Slave membership management", .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_slaves_set }, [BOND_OPT_TLB_DYNAMIC_LB] = { .id = BOND_OPT_TLB_DYNAMIC_LB, .name = "tlb_dynamic_lb", .desc = "Enable dynamic flow shuffling", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_TLB) | BIT(BOND_MODE_ALB)), .values = bond_tlb_dynamic_lb_tbl, .flags = BOND_OPTFLAG_IFDOWN, .set = bond_option_tlb_dynamic_lb_set, }, [BOND_OPT_AD_ACTOR_SYS_PRIO] = { .id = BOND_OPT_AD_ACTOR_SYS_PRIO, .name = "ad_actor_sys_prio", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_ad_actor_sys_prio_tbl, .set = bond_option_ad_actor_sys_prio_set, }, [BOND_OPT_ACTOR_PORT_PRIO] = { .id = BOND_OPT_ACTOR_PORT_PRIO, .name = "actor_port_prio", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_actor_port_prio_set, }, [BOND_OPT_AD_ACTOR_SYSTEM] = { .id = BOND_OPT_AD_ACTOR_SYSTEM, .name = "ad_actor_system", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_RAWVAL, .set = bond_option_ad_actor_system_set, }, [BOND_OPT_AD_USER_PORT_KEY] = { .id = BOND_OPT_AD_USER_PORT_KEY, .name = "ad_user_port_key", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_IFDOWN, .values = bond_ad_user_port_key_tbl, .set = bond_option_ad_user_port_key_set, }, [BOND_OPT_NUM_PEER_NOTIF_ALIAS] = { .id = BOND_OPT_NUM_PEER_NOTIF_ALIAS, .name = "num_grat_arp", .desc = "Number of peer notifications to send on failover event", .values = bond_num_peer_notif_tbl, .set = bond_option_num_peer_notif_set }, [BOND_OPT_PEER_NOTIF_DELAY] = { .id = BOND_OPT_PEER_NOTIF_DELAY, .name = "peer_notif_delay", .desc = "Delay between each peer notification on failover event, in milliseconds", .values = bond_peer_notif_delay_tbl, .set = bond_option_peer_notif_delay_set }, [BOND_OPT_COUPLED_CONTROL] = { .id = BOND_OPT_COUPLED_CONTROL, .name = "coupled_control", .desc = "Opt into using coupled control MUX for LACP states", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .flags = BOND_OPTFLAG_IFDOWN, .values = bond_coupled_control_tbl, .set = bond_option_coupled_control_set, }, [BOND_OPT_BROADCAST_NEIGH] = { .id = BOND_OPT_BROADCAST_NEIGH, .name = "broadcast_neighbor", .desc = "Broadcast neighbor packets to all active slaves", .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), .values = bond_broadcast_neigh_tbl, .set = bond_option_broadcast_neigh_set, } }; /* Searches for an option by name */ const struct bond_option *bond_opt_get_by_name(const char *name) { const struct bond_option *opt; int option; for (option = 0; option < BOND_OPT_LAST; option++) { opt = bond_opt_get(option); if (opt && !strcmp(opt->name, name)) return opt; } return NULL; } /* Searches for a value in opt's values[] table */ const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val) { const struct bond_option *opt; int i; opt = bond_opt_get(option); if (WARN_ON(!opt)) return NULL; for (i = 0; opt->values && opt->values[i].string; i++) if (opt->values[i].value == val) return &opt->values[i]; return NULL; } /* Searches for a value in opt's values[] table which matches the flagmask */ static const struct bond_opt_value *bond_opt_get_flags(const struct bond_option *opt, u32 flagmask) { int i; for (i = 0; opt->values && opt->values[i].string; i++) if (opt->values[i].flags & flagmask) return &opt->values[i]; return NULL; } /* If maxval is missing then there's no range to check. In case minval is * missing then it's considered to be 0. */ static bool bond_opt_check_range(const struct bond_option *opt, u64 val) { const struct bond_opt_value *minval, *maxval; minval = bond_opt_get_flags(opt, BOND_VALFLAG_MIN); maxval = bond_opt_get_flags(opt, BOND_VALFLAG_MAX); if (!maxval || (minval && val < minval->value) || val > maxval->value) return false; return true; } /** * bond_opt_parse - parse option value * @opt: the option to parse against * @val: value to parse * * This function tries to extract the value from @val and check if it's * a possible match for the option and returns NULL if a match isn't found, * or the struct_opt_value that matched. It also strips the new line from * @val->string if it's present. */ const struct bond_opt_value *bond_opt_parse(const struct bond_option *opt, struct bond_opt_value *val) { char *p, valstr[BOND_OPT_MAX_NAMELEN + 1] = { 0, }; const struct bond_opt_value *tbl; const struct bond_opt_value *ret = NULL; bool checkval; int i, rv; /* No parsing if the option wants a raw val */ if (opt->flags & BOND_OPTFLAG_RAWVAL) return val; tbl = opt->values; if (!tbl) goto out; /* ULLONG_MAX is used to bypass string processing */ checkval = val->value != ULLONG_MAX; if (!checkval) { if (!val->string) goto out; p = strchr(val->string, '\n'); if (p) *p = '\0'; for (p = val->string; *p; p++) if (!(isdigit(*p) || isspace(*p))) break; /* The following code extracts the string to match or the value * and sets checkval appropriately */ if (*p) { rv = sscanf(val->string, "%32s", valstr); } else { rv = sscanf(val->string, "%llu", &val->value); checkval = true; } if (!rv) goto out; } for (i = 0; tbl[i].string; i++) { /* Check for exact match */ if (checkval) { if (val->value == tbl[i].value) ret = &tbl[i]; } else { if (!strcmp(valstr, "default") && (tbl[i].flags & BOND_VALFLAG_DEFAULT)) ret = &tbl[i]; if (!strcmp(valstr, tbl[i].string)) ret = &tbl[i]; } /* Found an exact match */ if (ret) goto out; } /* Possible range match */ if (checkval && bond_opt_check_range(opt, val->value)) ret = val; out: return ret; } /* Check opt's dependencies against bond mode and currently set options */ static int bond_opt_check_deps(struct bonding *bond, const struct bond_option *opt) { struct bond_params *params = &bond->params; if (test_bit(params->mode, &opt->unsuppmodes)) return -EACCES; if ((opt->flags & BOND_OPTFLAG_NOSLAVES) && bond_has_slaves(bond)) return -ENOTEMPTY; if ((opt->flags & BOND_OPTFLAG_IFDOWN) && (bond->dev->flags & IFF_UP)) return -EBUSY; return 0; } static void bond_opt_dep_print(struct bonding *bond, const struct bond_option *opt, struct nlattr *bad_attr, struct netlink_ext_ack *extack) { const struct bond_opt_value *modeval; struct bond_params *params; params = &bond->params; modeval = bond_opt_get_val(BOND_OPT_MODE, params->mode); if (test_bit(params->mode, &opt->unsuppmodes)) { netdev_err(bond->dev, "option %s: mode dependency failed, not supported in mode %s(%llu)\n", opt->name, modeval->string, modeval->value); NL_SET_ERR_MSG_ATTR(extack, bad_attr, "option not supported in mode"); } } static void bond_opt_error_interpret(struct bonding *bond, const struct bond_option *opt, int error, const struct bond_opt_value *val, struct nlattr *bad_attr, struct netlink_ext_ack *extack) { const struct bond_opt_value *minval, *maxval; char *p; switch (error) { case -EINVAL: NL_SET_ERR_MSG_ATTR(extack, bad_attr, "invalid option value"); if (val) { if (val->string) { /* sometimes RAWVAL opts may have new lines */ p = strchr(val->string, '\n'); if (p) *p = '\0'; netdev_err(bond->dev, "option %s: invalid value (%s)\n", opt->name, val->string); } else { netdev_err(bond->dev, "option %s: invalid value (%llu)\n", opt->name, val->value); } } minval = bond_opt_get_flags(opt, BOND_VALFLAG_MIN); maxval = bond_opt_get_flags(opt, BOND_VALFLAG_MAX); if (!maxval) break; netdev_err(bond->dev, "option %s: allowed values %llu - %llu\n", opt->name, minval ? minval->value : 0, maxval->value); break; case -EACCES: bond_opt_dep_print(bond, opt, bad_attr, extack); break; case -ENOTEMPTY: NL_SET_ERR_MSG_ATTR(extack, bad_attr, "unable to set option because the bond device has slaves"); netdev_err(bond->dev, "option %s: unable to set because the bond device has slaves\n", opt->name); break; case -EBUSY: NL_SET_ERR_MSG_ATTR(extack, bad_attr, "unable to set option because the bond is up"); netdev_err(bond->dev, "option %s: unable to set because the bond device is up\n", opt->name); break; case -ENODEV: if (val && val->string) { p = strchr(val->string, '\n'); if (p) *p = '\0'; netdev_err(bond->dev, "option %s: interface %s does not exist!\n", opt->name, val->string); NL_SET_ERR_MSG_ATTR(extack, bad_attr, "interface does not exist"); } break; default: break; } } /** * __bond_opt_set - set a bonding option * @bond: target bond device * @option: option to set * @val: value to set it to * @bad_attr: netlink attribue that caused the error * @extack: extended netlink error structure, used when an error message * needs to be returned to the caller via netlink * * This function is used to change the bond's option value, it can be * used for both enabling/changing an option and for disabling it. RTNL lock * must be obtained before calling this function. */ int __bond_opt_set(struct bonding *bond, unsigned int option, struct bond_opt_value *val, struct nlattr *bad_attr, struct netlink_ext_ack *extack) { const struct bond_opt_value *retval = NULL; const struct bond_option *opt; int ret = -ENOENT; ASSERT_RTNL(); opt = bond_opt_get(option); if (WARN_ON(!val) || WARN_ON(!opt)) goto out; ret = bond_opt_check_deps(bond, opt); if (ret) goto out; retval = bond_opt_parse(opt, val); if (!retval) { ret = -EINVAL; goto out; } ret = opt->set(bond, retval); out: if (ret) bond_opt_error_interpret(bond, opt, ret, val, bad_attr, extack); return ret; } /** * __bond_opt_set_notify - set a bonding option * @bond: target bond device * @option: option to set * @val: value to set it to * * This function is used to change the bond's option value and trigger * a notification to user sapce. It can be used for both enabling/changing * an option and for disabling it. RTNL lock must be obtained before calling * this function. */ int __bond_opt_set_notify(struct bonding *bond, unsigned int option, struct bond_opt_value *val) { int ret; ASSERT_RTNL(); ret = __bond_opt_set(bond, option, val, NULL, NULL); if (!ret && (bond->dev->reg_state == NETREG_REGISTERED)) call_netdevice_notifiers(NETDEV_CHANGEINFODATA, bond->dev); return ret; } /** * bond_opt_tryset_rtnl - try to acquire rtnl and call __bond_opt_set * @bond: target bond device * @option: option to set * @buf: value to set it to * * This function tries to acquire RTNL without blocking and if successful * calls __bond_opt_set. It is mainly used for sysfs option manipulation. */ int bond_opt_tryset_rtnl(struct bonding *bond, unsigned int option, char *buf) { struct bond_opt_value optval; int ret; if (!rtnl_trylock()) return restart_syscall(); bond_opt_initstr(&optval, buf); ret = __bond_opt_set_notify(bond, option, &optval); rtnl_unlock(); return ret; } /** * bond_opt_get - get a pointer to an option * @option: option for which to return a pointer * * This function checks if option is valid and if so returns a pointer * to its entry in the bond_opts[] option array. */ const struct bond_option *bond_opt_get(unsigned int option) { if (!BOND_OPT_VALID(option)) return NULL; return &bond_opts[option]; } static bool bond_set_xfrm_features(struct bonding *bond) { if (!IS_ENABLED(CONFIG_XFRM_OFFLOAD)) return false; if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond->dev->wanted_features |= BOND_XFRM_FEATURES; else bond->dev->wanted_features &= ~BOND_XFRM_FEATURES; return true; } static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { if (bond->xdp_prog && !bond_xdp_check(bond, newval->value)) return -EOPNOTSUPP; if (!bond_mode_uses_arp(newval->value)) { if (bond->params.arp_interval) { netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n", newval->string); /* disable arp monitoring */ bond->params.arp_interval = 0; } if (!bond->params.miimon) { /* set miimon to default value */ bond->params.miimon = BOND_DEFAULT_MIIMON; netdev_dbg(bond->dev, "Setting MII monitoring interval to %d\n", bond->params.miimon); } } if (newval->value == BOND_MODE_ALB) bond->params.tlb_dynamic_lb = 1; /* don't cache arp_validate between modes */ bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; bond->params.mode = newval->value; /* When changing mode, the bond device is down, we may reduce * the bond_bcast_neigh_enabled in bond_close() if broadcast_neighbor * enabled in 8023ad mode. Therefore, only clear broadcast_neighbor * to 0. */ bond->params.broadcast_neighbor = 0; if (bond->dev->reg_state == NETREG_REGISTERED) { bool update = false; update |= bond_set_xfrm_features(bond); if (update) netdev_update_features(bond->dev); } bond_xdp_set_features(bond->dev); return 0; } static int bond_option_active_slave_set(struct bonding *bond, const struct bond_opt_value *newval) { char ifname[IFNAMSIZ] = { 0, }; struct net_device *slave_dev; int ret = 0; sscanf(newval->string, "%15s", ifname); /* IFNAMSIZ */ if (!strlen(ifname) || newval->string[0] == '\n') { slave_dev = NULL; } else { slave_dev = __dev_get_by_name(dev_net(bond->dev), ifname); if (!slave_dev) return -ENODEV; } if (slave_dev) { if (!netif_is_bond_slave(slave_dev)) { slave_err(bond->dev, slave_dev, "Device is not bonding slave\n"); return -EINVAL; } if (bond->dev != netdev_master_upper_dev_get(slave_dev)) { slave_err(bond->dev, slave_dev, "Device is not our slave\n"); return -EINVAL; } } block_netpoll_tx(); /* check to see if we are clearing active */ if (!slave_dev) { netdev_dbg(bond->dev, "Clearing current active slave\n"); bond_change_active_slave(bond, NULL); bond_select_active_slave(bond); } else { struct slave *old_active = rtnl_dereference(bond->curr_active_slave); struct slave *new_active = bond_slave_get_rtnl(slave_dev); BUG_ON(!new_active); if (new_active == old_active) { /* do nothing */ slave_dbg(bond->dev, new_active->dev, "is already the current active slave\n"); } else { if (old_active && (new_active->link == BOND_LINK_UP) && bond_slave_is_up(new_active)) { slave_dbg(bond->dev, new_active->dev, "Setting as active slave\n"); bond_change_active_slave(bond, new_active); } else { slave_err(bond->dev, new_active->dev, "Could not set as active slave; either %s is down or the link is down\n", new_active->dev->name); ret = -EINVAL; } } } unblock_netpoll_tx(); return ret; } /* There are two tricky bits here. First, if MII monitoring is activated, then * we must disable ARP monitoring. Second, if the timer isn't running, we must * start it. */ static int bond_option_miimon_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting MII monitoring interval to %llu\n", newval->value); bond->params.miimon = newval->value; if (bond->params.updelay) netdev_dbg(bond->dev, "Note: Updating updelay (to %d) since it is a multiple of the miimon value\n", bond->params.updelay * bond->params.miimon); if (bond->params.downdelay) netdev_dbg(bond->dev, "Note: Updating downdelay (to %d) since it is a multiple of the miimon value\n", bond->params.downdelay * bond->params.miimon); if (bond->params.peer_notif_delay) netdev_dbg(bond->dev, "Note: Updating peer_notif_delay (to %d) since it is a multiple of the miimon value\n", bond->params.peer_notif_delay * bond->params.miimon); if (newval->value && bond->params.arp_interval) { netdev_dbg(bond->dev, "MII monitoring cannot be used with ARP monitoring - disabling ARP monitoring...\n"); bond->params.arp_interval = 0; if (bond->params.arp_validate) bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; } if (bond->dev->flags & IFF_UP) { /* If the interface is up, we may need to fire off * the MII timer. If the interface is down, the * timer will get fired off when the open function * is called. */ if (!newval->value) { cancel_delayed_work_sync(&bond->mii_work); } else { cancel_delayed_work_sync(&bond->arp_work); queue_delayed_work(bond->wq, &bond->mii_work, 0); } } return 0; } /* Set up, down and peer notification delays. These must be multiples * of the MII monitoring value, and are stored internally as the * multiplier. Thus, we must translate to MS for the real world. */ static int _bond_option_delay_set(struct bonding *bond, const struct bond_opt_value *newval, const char *name, int *target) { int value = newval->value; if (!bond->params.miimon) { netdev_err(bond->dev, "Unable to set %s as MII monitoring is disabled\n", name); return -EPERM; } if ((value % bond->params.miimon) != 0) { netdev_warn(bond->dev, "%s (%d) is not a multiple of miimon (%d), value rounded to %d ms\n", name, value, bond->params.miimon, (value / bond->params.miimon) * bond->params.miimon); } *target = value / bond->params.miimon; netdev_dbg(bond->dev, "Setting %s to %d\n", name, *target * bond->params.miimon); return 0; } static int bond_option_updelay_set(struct bonding *bond, const struct bond_opt_value *newval) { return _bond_option_delay_set(bond, newval, "up delay", &bond->params.updelay); } static int bond_option_downdelay_set(struct bonding *bond, const struct bond_opt_value *newval) { return _bond_option_delay_set(bond, newval, "down delay", &bond->params.downdelay); } static int bond_option_peer_notif_delay_set(struct bonding *bond, const struct bond_opt_value *newval) { int ret = _bond_option_delay_set(bond, newval, "peer notification delay", &bond->params.peer_notif_delay); return ret; } static int bond_option_use_carrier_set(struct bonding *bond, const struct bond_opt_value *newval) { return 0; } /* There are two tricky bits here. First, if ARP monitoring is activated, then * we must disable MII monitoring. Second, if the ARP timer isn't running, * we must start it. */ static int bond_option_arp_interval_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ARP monitoring interval to %llu\n", newval->value); bond->params.arp_interval = newval->value; if (newval->value) { if (bond->params.miimon) { netdev_dbg(bond->dev, "ARP monitoring cannot be used with MII monitoring. Disabling MII monitoring\n"); bond->params.miimon = 0; } if (!bond->params.arp_targets[0]) netdev_dbg(bond->dev, "ARP monitoring has been set up, but no ARP targets have been specified\n"); } if (bond->dev->flags & IFF_UP) { /* If the interface is up, we may need to fire off * the ARP timer. If the interface is down, the * timer will get fired off when the open function * is called. */ if (!newval->value) { if (bond->params.arp_validate) bond->recv_probe = NULL; cancel_delayed_work_sync(&bond->arp_work); } else { /* arp_validate can be set only in active-backup mode */ bond->recv_probe = bond_rcv_validate; cancel_delayed_work_sync(&bond->mii_work); queue_delayed_work(bond->wq, &bond->arp_work, 0); } } return 0; } static void _bond_options_arp_ip_target_set(struct bonding *bond, int slot, __be32 target, unsigned long last_rx) { __be32 *targets = bond->params.arp_targets; struct list_head *iter; struct slave *slave; if (slot >= 0 && slot < BOND_MAX_ARP_TARGETS) { bond_for_each_slave(bond, slave, iter) slave->target_last_arp_rx[slot] = last_rx; targets[slot] = target; } } static int _bond_option_arp_ip_target_add(struct bonding *bond, __be32 target) { __be32 *targets = bond->params.arp_targets; int ind; if (!bond_is_ip_target_ok(target)) { netdev_err(bond->dev, "invalid ARP target %pI4 specified for addition\n", &target); return -EINVAL; } if (bond_get_targets_ip(targets, target) != -1) { /* dup */ netdev_err(bond->dev, "ARP target %pI4 is already present\n", &target); return -EINVAL; } ind = bond_get_targets_ip(targets, 0); /* first free slot */ if (ind == -1) { netdev_err(bond->dev, "ARP target table is full!\n"); return -EINVAL; } netdev_dbg(bond->dev, "Adding ARP target %pI4\n", &target); _bond_options_arp_ip_target_set(bond, ind, target, jiffies); return 0; } static int bond_option_arp_ip_target_add(struct bonding *bond, __be32 target) { return _bond_option_arp_ip_target_add(bond, target); } static int bond_option_arp_ip_target_rem(struct bonding *bond, __be32 target) { __be32 *targets = bond->params.arp_targets; struct list_head *iter; struct slave *slave; unsigned long *targets_rx; int ind, i; if (!bond_is_ip_target_ok(target)) { netdev_err(bond->dev, "invalid ARP target %pI4 specified for removal\n", &target); return -EINVAL; } ind = bond_get_targets_ip(targets, target); if (ind == -1) { netdev_err(bond->dev, "unable to remove nonexistent ARP target %pI4\n", &target); return -EINVAL; } if (ind == 0 && !targets[1] && bond->params.arp_interval) netdev_warn(bond->dev, "Removing last arp target with arp_interval on\n"); netdev_dbg(bond->dev, "Removing ARP target %pI4\n", &target); bond_for_each_slave(bond, slave, iter) { targets_rx = slave->target_last_arp_rx; for (i = ind; (i < BOND_MAX_ARP_TARGETS-1) && targets[i+1]; i++) targets_rx[i] = targets_rx[i+1]; targets_rx[i] = 0; } for (i = ind; (i < BOND_MAX_ARP_TARGETS-1) && targets[i+1]; i++) targets[i] = targets[i+1]; targets[i] = 0; return 0; } void bond_option_arp_ip_targets_clear(struct bonding *bond) { int i; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) _bond_options_arp_ip_target_set(bond, i, 0, 0); } static int bond_option_arp_ip_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { int ret = -EPERM; __be32 target; if (newval->string) { if (strlen(newval->string) < 1 || !in4_pton(newval->string + 1, -1, (u8 *)&target, -1, NULL)) { netdev_err(bond->dev, "invalid ARP target specified\n"); return ret; } if (newval->string[0] == '+') ret = bond_option_arp_ip_target_add(bond, target); else if (newval->string[0] == '-') ret = bond_option_arp_ip_target_rem(bond, target); else netdev_err(bond->dev, "no command found in arp_ip_targets file - use +<addr> or -<addr>\n"); } else { target = newval->value; ret = bond_option_arp_ip_target_add(bond, target); } return ret; } #if IS_ENABLED(CONFIG_IPV6) static bool slave_can_set_ns_maddr(const struct bonding *bond, struct slave *slave) { return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && !bond_is_active_slave(slave) && slave->dev->flags & IFF_MULTICAST; } /** * slave_set_ns_maddrs - add/del all NS mac addresses for slave * @bond: bond device * @slave: slave device * @add: add or remove all the NS mac addresses * * This function tries to add or delete all the NS mac addresses on the slave * * Note, the IPv6 NS target address is the unicast address in Neighbor * Solicitation (NS) message. The dest address of NS message should be * solicited-node multicast address of the target. The dest mac of NS message * is converted from the solicited-node multicast address. * * This function is called when * * arp_validate changes * * enslaving, releasing new slaves */ static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) { struct in6_addr *targets = bond->params.ns_targets; char slot_maddr[MAX_ADDR_LEN]; struct in6_addr mcaddr; int i; if (!slave_can_set_ns_maddr(bond, slave)) return; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { if (ipv6_addr_any(&targets[i])) break; addrconf_addr_solict_mult(&targets[i], &mcaddr); if (!ndisc_mc_map(&mcaddr, slot_maddr, slave->dev, 0)) { if (add) dev_mc_add(slave->dev, slot_maddr); else dev_mc_del(slave->dev, slot_maddr); } } } void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) { if (!bond->params.arp_validate) return; slave_set_ns_maddrs(bond, slave, true); } void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) { if (!bond->params.arp_validate) return; slave_set_ns_maddrs(bond, slave, false); } /** * slave_set_ns_maddr - set new NS mac address for slave * @bond: bond device * @slave: slave device * @target: the new IPv6 target * @slot: the old IPv6 target in the slot * * This function tries to replace the old mac address to new one on the slave. * * Note, the target/slot IPv6 address is the unicast address in Neighbor * Solicitation (NS) message. The dest address of NS message should be * solicited-node multicast address of the target. The dest mac of NS message * is converted from the solicited-node multicast address. * * This function is called when * * An IPv6 NS target is added or removed. */ static void slave_set_ns_maddr(struct bonding *bond, struct slave *slave, struct in6_addr *target, struct in6_addr *slot) { char mac_addr[MAX_ADDR_LEN]; struct in6_addr mcast_addr; if (!bond->params.arp_validate || !slave_can_set_ns_maddr(bond, slave)) return; /* remove the previous mac addr from slave */ addrconf_addr_solict_mult(slot, &mcast_addr); if (!ipv6_addr_any(slot) && !ndisc_mc_map(&mcast_addr, mac_addr, slave->dev, 0)) dev_mc_del(slave->dev, mac_addr); /* add new mac addr on slave if target is set */ addrconf_addr_solict_mult(target, &mcast_addr); if (!ipv6_addr_any(target) && !ndisc_mc_map(&mcast_addr, mac_addr, slave->dev, 0)) dev_mc_add(slave->dev, mac_addr); } static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct in6_addr *target, unsigned long last_rx) { struct in6_addr *targets = bond->params.ns_targets; struct list_head *iter; struct slave *slave; if (slot >= 0 && slot < BOND_MAX_NS_TARGETS) { bond_for_each_slave(bond, slave, iter) { slave->target_last_arp_rx[slot] = last_rx; slave_set_ns_maddr(bond, slave, target, &targets[slot]); } targets[slot] = *target; } } void bond_option_ns_ip6_targets_clear(struct bonding *bond) { struct in6_addr addr_any = in6addr_any; int i; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) _bond_options_ns_ip6_target_set(bond, i, &addr_any, 0); } static int bond_option_ns_ip6_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { struct in6_addr *target = (struct in6_addr *)newval->extra; struct in6_addr *targets = bond->params.ns_targets; struct in6_addr addr_any = in6addr_any; int index; if (!bond_is_ip6_target_ok(target)) { netdev_err(bond->dev, "invalid NS target %pI6c specified for addition\n", target); return -EINVAL; } if (bond_get_targets_ip6(targets, target) != -1) { /* dup */ netdev_err(bond->dev, "NS target %pI6c is already present\n", target); return -EINVAL; } index = bond_get_targets_ip6(targets, &addr_any); /* first free slot */ if (index == -1) { netdev_err(bond->dev, "NS target table is full!\n"); return -EINVAL; } netdev_dbg(bond->dev, "Adding NS target %pI6c\n", target); _bond_options_ns_ip6_target_set(bond, index, target, jiffies); return 0; } #else static int bond_option_ns_ip6_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { return -EPERM; } static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) {} void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) {} void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) {} #endif static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval) { bool changed = !!bond->params.arp_validate != !!newval->value; struct list_head *iter; struct slave *slave; netdev_dbg(bond->dev, "Setting arp_validate to %s (%llu)\n", newval->string, newval->value); bond->params.arp_validate = newval->value; if (changed) { bond_for_each_slave(bond, slave, iter) slave_set_ns_maddrs(bond, slave, !!bond->params.arp_validate); } return 0; } static int bond_option_arp_all_targets_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting arp_all_targets to %s (%llu)\n", newval->string, newval->value); bond->params.arp_all_targets = newval->value; return 0; } static int bond_option_missed_max_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting missed max to %s (%llu)\n", newval->string, newval->value); bond->params.missed_max = newval->value; return 0; } static int bond_option_prio_set(struct bonding *bond, const struct bond_opt_value *newval) { struct slave *slave; slave = bond_slave_get_rtnl(newval->slave_dev); if (!slave) { netdev_dbg(newval->slave_dev, "%s called on NULL slave\n", __func__); return -ENODEV; } slave->prio = newval->value; if (rtnl_dereference(bond->primary_slave)) slave_warn(bond->dev, slave->dev, "prio updated, but will not affect failover re-selection as primary slave have been set\n"); else bond_select_active_slave(bond); return 0; } static int bond_option_primary_set(struct bonding *bond, const struct bond_opt_value *newval) { char *p, *primary = newval->string; struct list_head *iter; struct slave *slave; block_netpoll_tx(); p = strchr(primary, '\n'); if (p) *p = '\0'; /* check to see if we are clearing primary */ if (!strlen(primary)) { netdev_dbg(bond->dev, "Setting primary slave to None\n"); RCU_INIT_POINTER(bond->primary_slave, NULL); memset(bond->params.primary, 0, sizeof(bond->params.primary)); bond_select_active_slave(bond); goto out; } bond_for_each_slave(bond, slave, iter) { if (strncmp(slave->dev->name, primary, IFNAMSIZ) == 0) { slave_dbg(bond->dev, slave->dev, "Setting as primary slave\n"); rcu_assign_pointer(bond->primary_slave, slave); strcpy(bond->params.primary, slave->dev->name); bond->force_primary = true; bond_select_active_slave(bond); goto out; } } if (rtnl_dereference(bond->primary_slave)) { netdev_dbg(bond->dev, "Setting primary slave to None\n"); RCU_INIT_POINTER(bond->primary_slave, NULL); bond_select_active_slave(bond); } strscpy_pad(bond->params.primary, primary, IFNAMSIZ); netdev_dbg(bond->dev, "Recording %s as primary, but it has not been enslaved yet\n", primary); out: unblock_netpoll_tx(); return 0; } static int bond_option_primary_reselect_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting primary_reselect to %s (%llu)\n", newval->string, newval->value); bond->params.primary_reselect = newval->value; block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); return 0; } static int bond_option_fail_over_mac_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting fail_over_mac to %s (%llu)\n", newval->string, newval->value); bond->params.fail_over_mac = newval->value; return 0; } static int bond_option_xmit_hash_policy_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting xmit hash policy to %s (%llu)\n", newval->string, newval->value); bond->params.xmit_policy = newval->value; return 0; } static int bond_option_resend_igmp_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting resend_igmp to %llu\n", newval->value); bond->params.resend_igmp = newval->value; return 0; } static int bond_option_num_peer_notif_set(struct bonding *bond, const struct bond_opt_value *newval) { bond->params.num_peer_notif = newval->value; return 0; } static int bond_option_all_slaves_active_set(struct bonding *bond, const struct bond_opt_value *newval) { struct list_head *iter; struct slave *slave; if (newval->value == bond->params.all_slaves_active) return 0; bond->params.all_slaves_active = newval->value; bond_for_each_slave(bond, slave, iter) { if (!bond_is_active_slave(slave)) { if (newval->value) slave->inactive = 0; else slave->inactive = 1; } } return 0; } static int bond_option_min_links_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting min links value to %llu\n", newval->value); bond->params.min_links = newval->value; bond_set_carrier(bond); return 0; } static int bond_option_lp_interval_set(struct bonding *bond, const struct bond_opt_value *newval) { bond->params.lp_interval = newval->value; return 0; } static int bond_option_pps_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting packets per slave to %llu\n", newval->value); bond->params.packets_per_slave = newval->value; if (newval->value > 0) { bond->params.reciprocal_packets_per_slave = reciprocal_value(newval->value); } else { /* reciprocal_packets_per_slave is unused if * packets_per_slave is 0 or 1, just initialize it */ bond->params.reciprocal_packets_per_slave = (struct reciprocal_value) { 0 }; } return 0; } static int bond_option_lacp_active_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_active = newval->value; bond_3ad_update_lacp_active(bond); return 0; } static int bond_option_lacp_rate_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting LACP rate to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_fast = newval->value; bond_3ad_update_lacp_rate(bond); return 0; } static int bond_option_ad_select_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ad_select to %s (%llu)\n", newval->string, newval->value); bond->params.ad_select = newval->value; return 0; } static int bond_option_queue_id_set(struct bonding *bond, const struct bond_opt_value *newval) { struct slave *slave, *update_slave; struct net_device *sdev; struct list_head *iter; char *delim; int ret = 0; u16 qid; /* delim will point to queue id if successful */ delim = strchr(newval->string, ':'); if (!delim) goto err_no_cmd; /* Terminate string that points to device name and bump it * up one, so we can read the queue id there. */ *delim = '\0'; if (sscanf(++delim, "%hd\n", &qid) != 1) goto err_no_cmd; /* Check buffer length, valid ifname and queue id */ if (!dev_valid_name(newval->string) || qid > bond->dev->real_num_tx_queues) goto err_no_cmd; /* Get the pointer to that interface if it exists */ sdev = __dev_get_by_name(dev_net(bond->dev), newval->string); if (!sdev) goto err_no_cmd; /* Search for thes slave and check for duplicate qids */ update_slave = NULL; bond_for_each_slave(bond, slave, iter) { if (sdev == slave->dev) /* We don't need to check the matching * slave for dups, since we're overwriting it */ update_slave = slave; else if (qid && qid == slave->queue_id) { goto err_no_cmd; } } if (!update_slave) goto err_no_cmd; /* Actually set the qids for the slave */ WRITE_ONCE(update_slave->queue_id, qid); out: return ret; err_no_cmd: netdev_dbg(bond->dev, "invalid input for queue_id set\n"); ret = -EPERM; goto out; } static int bond_option_slaves_set(struct bonding *bond, const struct bond_opt_value *newval) { char command[IFNAMSIZ + 1] = { 0, }; struct net_device *dev; char *ifname; int ret; sscanf(newval->string, "%16s", command); /* IFNAMSIZ*/ ifname = command + 1; if ((strlen(command) <= 1) || (command[0] != '+' && command[0] != '-') || !dev_valid_name(ifname)) goto err_no_cmd; dev = __dev_get_by_name(dev_net(bond->dev), ifname); if (!dev) { netdev_dbg(bond->dev, "interface %s does not exist!\n", ifname); ret = -ENODEV; goto out; } switch (command[0]) { case '+': slave_dbg(bond->dev, dev, "Enslaving interface\n"); ret = bond_enslave(bond->dev, dev, NULL); break; case '-': slave_dbg(bond->dev, dev, "Releasing interface\n"); ret = bond_release(bond->dev, dev); break; default: /* should not run here. */ goto err_no_cmd; } out: return ret; err_no_cmd: netdev_err(bond->dev, "no command found in slaves file - use +ifname or -ifname\n"); ret = -EPERM; goto out; } static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting dynamic-lb to %s (%llu)\n", newval->string, newval->value); bond->params.tlb_dynamic_lb = newval->value; return 0; } static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ad_actor_sys_prio to %llu\n", newval->value); bond->params.ad_actor_sys_prio = newval->value; bond_3ad_update_ad_actor_settings(bond); return 0; } static int bond_option_actor_port_prio_set(struct bonding *bond, const struct bond_opt_value *newval) { struct slave *slave; slave = bond_slave_get_rtnl(newval->slave_dev); if (!slave) { netdev_dbg(bond->dev, "%s called on NULL slave\n", __func__); return -ENODEV; } netdev_dbg(newval->slave_dev, "Setting actor_port_prio to %llu\n", newval->value); SLAVE_AD_INFO(slave)->port_priority = newval->value; bond_3ad_update_ad_actor_settings(bond); return 0; } static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval) { u8 macaddr[ETH_ALEN]; u8 *mac; if (newval->string) { if (!mac_pton(newval->string, macaddr)) goto err; mac = macaddr; } else { mac = (u8 *)&newval->value; } if (is_multicast_ether_addr(mac)) goto err; netdev_dbg(bond->dev, "Setting ad_actor_system to %pM\n", mac); ether_addr_copy(bond->params.ad_actor_system, mac); bond_3ad_update_ad_actor_settings(bond); return 0; err: netdev_err(bond->dev, "Invalid ad_actor_system MAC address.\n"); return -EINVAL; } static int bond_option_ad_user_port_key_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_dbg(bond->dev, "Setting ad_user_port_key to %llu\n", newval->value); bond->params.ad_user_port_key = newval->value; return 0; } static int bond_option_coupled_control_set(struct bonding *bond, const struct bond_opt_value *newval) { netdev_info(bond->dev, "Setting coupled_control to %s (%llu)\n", newval->string, newval->value); bond->params.coupled_control = newval->value; return 0; } static int bond_option_broadcast_neigh_set(struct bonding *bond, const struct bond_opt_value *newval) { if (bond->params.broadcast_neighbor == newval->value) return 0; bond->params.broadcast_neighbor = newval->value; if (bond->dev->flags & IFF_UP) { if (bond->params.broadcast_neighbor) static_branch_inc(&bond_bcast_neigh_enabled); else static_branch_dec(&bond_bcast_neigh_enabled); } netdev_dbg(bond->dev, "Setting broadcast_neighbor to %s (%llu)\n", newval->string, newval->value); return 0; }
1 1 4 1 1 1 1 2 2 2 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 2 2 2 2 1 1 1 2 2 4 4 4 4 4 402 1 1 1 1 1 1 8 8 8 8 7 7 8 8 2 2 2 1 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 3 3 3 3 3 3 1 3 3 3 3 3 3 2 1 1 1 1 1 1 46 5 43 46 1 9 9 9 5 5 6 5 1 1 1 1 6 8 7 6 8 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 4 4 4 1 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 6 6 3 9 9 10 9 10 3 9 2 10 2 10 9 4 4 9 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 7 8 8 8 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/buffer.c * * Copyright (C) 1991, 1992, 2002 Linus Torvalds */ /* * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 * * Removed a lot of unnecessary code and simplified things now that * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 * * Speed up hash, lru, and free list operations. Use gfp() for allocating * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM * * Added 32k buffer block sizes - these are required older ARM systems. - RMK * * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/mm.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/capability.h> #include <linux/blkdev.h> #include <linux/file.h> #include <linux/quotaops.h> #include <linux/highmem.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/hash.h> #include <linux/suspend.h> #include <linux/buffer_head.h> #include <linux/task_io_accounting_ops.h> #include <linux/bio.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> #include <linux/pagevec.h> #include <linux/sched/mm.h> #include <trace/events/block.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> #include <linux/sched/isolation.h> #include "internal.h" static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) inline void touch_buffer(struct buffer_head *bh) { trace_block_touch_buffer(bh); folio_mark_accessed(bh->b_folio); } EXPORT_SYMBOL(touch_buffer); void __lock_buffer(struct buffer_head *bh) { wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { clear_bit_unlock(BH_Lock, &bh->b_state); smp_mb__after_atomic(); wake_up_bit(&bh->b_state, BH_Lock); } EXPORT_SYMBOL(unlock_buffer); /* * Returns if the folio has dirty or writeback buffers. If all the buffers * are unlocked and clean then the folio_test_dirty information is stale. If * any of the buffers are locked, it is assumed they are locked for IO. */ void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct buffer_head *head, *bh; *dirty = false; *writeback = false; BUG_ON(!folio_test_locked(folio)); head = folio_buffers(folio); if (!head) return; if (folio_test_writeback(folio)) *writeback = true; bh = head; do { if (buffer_locked(bh)) *writeback = true; if (buffer_dirty(bh)) *dirty = true; bh = bh->b_this_page; } while (bh != head); } /* * Block until a buffer comes unlocked. This doesn't stop it * from becoming locked again - you have to lock it yourself * if you want to preserve its state. */ void __wait_on_buffer(struct buffer_head * bh) { wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__wait_on_buffer); static void buffer_io_error(struct buffer_head *bh, char *msg) { if (!test_bit(BH_Quiet, &bh->b_state)) printk_ratelimited(KERN_ERR "Buffer I/O error on dev %pg, logical block %llu%s\n", bh->b_bdev, (unsigned long long)bh->b_blocknr, msg); } /* * End-of-IO handler helper function which does not touch the bh after * unlocking it. * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but * a race there is benign: unlock_buffer() only use the bh's address for * hashing after unlocking the buffer, so it doesn't actually touch the bh * itself. */ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); } else { /* This happens, due to failed read-ahead attempts. */ clear_buffer_uptodate(bh); } unlock_buffer(bh); } /* * Default synchronous end-of-IO handler.. Just mark it up-to-date and * unlock the buffer. */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { put_bh(bh); __end_buffer_read_notouch(bh, uptodate); } EXPORT_SYMBOL(end_buffer_read_sync); void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost sync page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } unlock_buffer(bh); put_bh(bh); } EXPORT_SYMBOL(end_buffer_write_sync); static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) { struct address_space *bd_mapping = bdev->bd_mapping; const int blkbits = bd_mapping->host->i_blkbits; struct buffer_head *ret = NULL; pgoff_t index; struct buffer_head *bh; struct buffer_head *head; struct folio *folio; int all_mapped = 1; static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = ((loff_t)block << blkbits) / PAGE_SIZE; folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto out; /* * Folio lock protects the buffers. Callers that cannot block * will fallback to serializing vs try_to_free_buffers() via * the i_private_lock. */ if (atomic) spin_lock(&bd_mapping->i_private_lock); else folio_lock(folio); head = folio_buffers(folio); if (!head) goto out_unlock; /* * Upon a noref migration, the folio lock serializes here; * otherwise bail. */ if (test_bit_acquire(BH_Migrate, &head->b_state)) { WARN_ON(!atomic); goto out_unlock; } bh = head; do { if (!buffer_mapped(bh)) all_mapped = 0; else if (bh->b_blocknr == block) { ret = bh; get_bh(bh); goto out_unlock; } bh = bh->b_this_page; } while (bh != head); /* we might be here because some of the buffers on this page are * not mapped. This is due to various races between * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers */ ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE); if (all_mapped && __ratelimit(&last_warned)) { printk("__find_get_block_slow() failed. block=%llu, " "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " "device %pg blocksize: %d\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr, bh->b_state, bh->b_size, bdev, 1 << blkbits); } out_unlock: if (atomic) spin_unlock(&bd_mapping->i_private_lock); else folio_unlock(folio); folio_put(folio); out: return ret; } static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; struct folio *folio; int folio_uptodate = 1; BUG_ON(!buffer_async_read(bh)); folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); buffer_io_error(bh, ", async page read"); } /* * Be _very_ careful from here on. Bad things can happen if * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. */ first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; do { if (!buffer_uptodate(tmp)) folio_uptodate = 0; if (buffer_async_read(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; } tmp = tmp->b_this_page; } while (tmp != bh); spin_unlock_irqrestore(&first->b_uptodate_lock, flags); folio_end_read(folio, folio_uptodate); return; still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); } struct postprocess_bh_ctx { struct work_struct work; struct buffer_head *bh; }; static void verify_bh(struct work_struct *work) { struct postprocess_bh_ctx *ctx = container_of(work, struct postprocess_bh_ctx, work); struct buffer_head *bh = ctx->bh; bool valid; valid = fsverity_verify_blocks(bh->b_folio, bh->b_size, bh_offset(bh)); end_buffer_async_read(bh, valid); kfree(ctx); } static bool need_fsverity(struct buffer_head *bh) { struct folio *folio = bh->b_folio; struct inode *inode = folio->mapping->host; return fsverity_active(inode) && /* needed by ext4 */ folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void decrypt_bh(struct work_struct *work) { struct postprocess_bh_ctx *ctx = container_of(work, struct postprocess_bh_ctx, work); struct buffer_head *bh = ctx->bh; int err; err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size, bh_offset(bh)); if (err == 0 && need_fsverity(bh)) { /* * We use different work queues for decryption and for verity * because verity may require reading metadata pages that need * decryption, and we shouldn't recurse to the same workqueue. */ INIT_WORK(&ctx->work, verify_bh); fsverity_enqueue_verify_work(&ctx->work); return; } end_buffer_async_read(bh, err == 0); kfree(ctx); } /* * I/O completion handler for block_read_full_folio() - pages * which come unlocked at the end of I/O. */ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate) { struct inode *inode = bh->b_folio->mapping->host; bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode); bool verify = need_fsverity(bh); /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */ if (uptodate && (decrypt || verify)) { struct postprocess_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); if (ctx) { ctx->bh = bh; if (decrypt) { INIT_WORK(&ctx->work, decrypt_bh); fscrypt_enqueue_decrypt_work(&ctx->work); } else { INIT_WORK(&ctx->work, verify_bh); fsverity_enqueue_verify_work(&ctx->work); } return; } uptodate = 0; } end_buffer_async_read(bh, uptodate); } /* * Completion handler for block_write_full_folio() - folios which are unlocked * during I/O, and which have the writeback flag cleared upon I/O completion. */ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; struct folio *folio; BUG_ON(!buffer_async_write(bh)); folio = bh->b_folio; if (uptodate) { set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } first = folio_buffers(folio); spin_lock_irqsave(&first->b_uptodate_lock, flags); clear_buffer_async_write(bh); unlock_buffer(bh); tmp = bh->b_this_page; while (tmp != bh) { if (buffer_async_write(tmp)) { BUG_ON(!buffer_locked(tmp)); goto still_busy; } tmp = tmp->b_this_page; } spin_unlock_irqrestore(&first->b_uptodate_lock, flags); folio_end_writeback(folio); return; still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); } /* * If a page's buffers are under async readin (end_buffer_async_read * completion) then there is a possibility that another thread of * control could lock one of the buffers after it has completed * but while some of the other buffers have not completed. This * locked buffer would confuse end_buffer_async_read() into not unlocking * the page. So the absence of BH_Async_Read tells end_buffer_async_read() * that this buffer is not under async I/O. * * The page comes unlocked when it has no locked buffer_async buffers * left. * * PageLocked prevents anyone starting new async I/O reads any of * the buffers. * * PageWriteback is used to prevent simultaneous writeout of the same * page. * * PageLocked prevents anyone from starting writeback of a page which is * under read I/O (PageWriteback is only ever set against a locked page). */ static void mark_buffer_async_read(struct buffer_head *bh) { bh->b_end_io = end_buffer_async_read_io; set_buffer_async_read(bh); } static void mark_buffer_async_write_endio(struct buffer_head *bh, bh_end_io_t *handler) { bh->b_end_io = handler; set_buffer_async_write(bh); } void mark_buffer_async_write(struct buffer_head *bh) { mark_buffer_async_write_endio(bh, end_buffer_async_write); } EXPORT_SYMBOL(mark_buffer_async_write); /* * fs/buffer.c contains helper functions for buffer-backed address space's * fsync functions. A common requirement for buffer-based filesystems is * that certain data from the backing blockdev needs to be written out for * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. * So the locking for i_private_list is via the i_private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, * mapping->i_private_list will always be protected by the backing blockdev's * ->i_private_lock. * * Which introduces a requirement: all buffers on an address_space's * ->i_private_list must be from the same address_space: the blockdev's. * * address_spaces which do not place buffers at ->i_private_list via these * utility functions are free to use i_private_lock and i_private_list for * whatever they want. The only requirement is that list_empty(i_private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). The * filesystems should do that. invalidate_inode_buffers() should just go * BUG_ON(!list_empty). * * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should * take an address_space, not an inode. And it should be called * mark_buffer_dirty_fsync() to clearly define why those buffers are being * queued up. * * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the * list if it is already on a list. Because if the buffer is on a list, * it *must* already be on the right one. If not, the filesystem is being * silly. This will save a ton of locking. But first we have to ensure * that buffers are taken *off* the old inode's list when they are freed * (presumably in truncate). That requires careful auditing of all * filesystems (do it inside bforget()). It could also be done by bringing * b_inode back. */ /* * The buffer's backing address_space's i_private_lock must be held */ static void __remove_assoc_queue(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_assoc_map); bh->b_assoc_map = NULL; } int inode_has_buffers(struct inode *inode) { return !list_empty(&inode->i_data.i_private_list); } /* * osync is designed to support O_SYNC io. It waits synchronously for * all already-submitted IO to complete, but does not queue any new * writes to the disk. * * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer * as you dirty the buffers, and then use osync_inode_buffers to wait for * completion. Any other dirty buffers which are not yet queued for * write will not be flushed to disk by the osync. */ static int osync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct list_head *p; int err = 0; spin_lock(lock); repeat: list_for_each_prev(p, list) { bh = BH_ENTRY(p); if (buffer_locked(bh)) { get_bh(bh); spin_unlock(lock); wait_on_buffer(bh); if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); spin_lock(lock); goto repeat; } } spin_unlock(lock); return err; } /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * * Starts I/O against the buffers at mapping->i_private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). * @mapping is a file or directory which needs those buffers to be written for * a successful fsync(). */ int sync_mapping_buffers(struct address_space *mapping) { struct address_space *buffer_mapping = mapping->i_private_data; if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) return 0; return fsync_buffers_list(&buffer_mapping->i_private_lock, &mapping->i_private_list); } EXPORT_SYMBOL(sync_mapping_buffers); /** * generic_buffers_fsync_noflush - generic buffer fsync implementation * for simple filesystems with no inode lock * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. */ int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, bool datasync) { struct inode *inode = file->f_mapping->host; int err; int ret; err = file_write_and_wait_range(file, start, end); if (err) return err; ret = sync_mapping_buffers(inode->i_mapping); if (!(inode_state_read_once(inode) & I_DIRTY_ALL)) goto out; if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; out: /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(generic_buffers_fsync_noflush); /** * generic_buffers_fsync - generic buffer fsync implementation * for simple filesystems with no inode lock * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. This also makes sure that * a device cache flush operation is called at the end. */ int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, bool datasync) { struct inode *inode = file->f_mapping->host; int ret; ret = generic_buffers_fsync_noflush(file, start, end, datasync); if (!ret) ret = blkdev_issue_flush(inode->i_sb->s_bdev); return ret; } EXPORT_SYMBOL(generic_buffers_fsync); /* * Called when we've recently written block `bblock', and it is known that * `bblock' was for a buffer_boundary() buffer. This means that the block at * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's * dirty, schedule it for IO. So that indirects merge nicely with their data. */ void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize) { struct buffer_head *bh; bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) write_dirty_buffer(bh, 0); put_bh(bh); } } void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); if (!mapping->i_private_data) { mapping->i_private_data = buffer_mapping; } else { BUG_ON(mapping->i_private_data != buffer_mapping); } if (!bh->b_assoc_map) { spin_lock(&buffer_mapping->i_private_lock); list_move_tail(&bh->b_assoc_buffers, &mapping->i_private_list); bh->b_assoc_map = mapping; spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(mark_buffer_dirty_inode); /** * block_dirty_folio - Mark a folio as dirty. * @mapping: The address space containing this folio. * @folio: The folio to mark dirty. * * Filesystems which use buffer_heads can use this function as their * ->dirty_folio implementation. Some filesystems need to do a little * work before calling this function. Filesystems which do not use * buffer_heads should call filemap_dirty_folio() instead. * * If the folio has buffers, the uptodate buffers are set dirty, to * preserve dirty-state coherency between the folio and the buffers. * Buffers added to a dirty folio are created dirty. * * The buffers are dirtied before the folio is dirtied. There's a small * race window in which writeback may see the folio cleanness but not the * buffer dirtiness. That's fine. If this code were to set the folio * dirty before the buffers, writeback could clear the folio dirty flag, * see a bunch of clean buffers and we'd end up with dirty buffers/clean * folio on the dirty folio list. * * We use i_private_lock to lock against try_to_free_buffers() while * using the folio's buffer list. This also prevents clean buffers * being added to the folio after it was set dirty. * * Context: May only be called from process context. Does not sleep. * Caller must ensure that @folio cannot be truncated during this call, * typically by holding the folio lock or having a page in the folio * mapped and holding the page table lock. * * Return: True if the folio was dirtied; false if it was already dirtied. */ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) { struct buffer_head *head; bool newly_dirty; spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; do { set_buffer_dirty(bh); bh = bh->b_this_page; } while (bh != head); } /* * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ newly_dirty = !folio_test_set_dirty(folio); spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } EXPORT_SYMBOL(block_dirty_folio); /* * Write out and wait upon a list of buffers. * * We have conflicting pressures: we want to make sure that all * initially dirty buffers get waited on, but that any subsequently * dirtied buffers don't. After all, we don't want fsync to last * forever if somebody is actively writing to the file. * * Do this in two main stages: first we copy dirty buffers to a * temporary inode list, queueing the writes as we go. Then we clean * up, waiting for those writes to complete. * * During this second stage, any subsequent updates to the file may end * up refiling the buffer on the original inode's dirty list again, so * there is a chance we will end up with a buffer queued for write but * not yet completed on that list. So, as a final cleanup we go through * the osync code to catch these locked, dirty buffers without requeuing * any newly dirty buffers for write. */ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct address_space *mapping; int err = 0, err2; struct blk_plug plug; LIST_HEAD(tmp); blk_start_plug(&plug); spin_lock(lock); while (!list_empty(list)) { bh = BH_ENTRY(list->next); mapping = bh->b_assoc_map; __remove_assoc_queue(bh); /* Avoid race with mark_buffer_dirty_inode() which does * a lockless check and we rely on seeing the dirty bit */ smp_mb(); if (buffer_dirty(bh) || buffer_locked(bh)) { list_add(&bh->b_assoc_buffers, &tmp); bh->b_assoc_map = mapping; if (buffer_dirty(bh)) { get_bh(bh); spin_unlock(lock); /* * Ensure any pending I/O completes so that * write_dirty_buffer() actually writes the * current contents - it is a noop if I/O is * still in flight on potentially older * contents. */ write_dirty_buffer(bh, REQ_SYNC); /* * Kick off IO for the previous mapping. Note * that we will not run the very last mapping, * wait_on_buffer() will do that for us * through sync_buffer(). */ brelse(bh); spin_lock(lock); } } } spin_unlock(lock); blk_finish_plug(&plug); spin_lock(lock); while (!list_empty(&tmp)) { bh = BH_ENTRY(tmp.prev); get_bh(bh); mapping = bh->b_assoc_map; __remove_assoc_queue(bh); /* Avoid race with mark_buffer_dirty_inode() which does * a lockless check and we rely on seeing the dirty bit */ smp_mb(); if (buffer_dirty(bh)) { list_add(&bh->b_assoc_buffers, &mapping->i_private_list); bh->b_assoc_map = mapping; } spin_unlock(lock); wait_on_buffer(bh); if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); spin_lock(lock); } spin_unlock(lock); err2 = osync_buffers_list(lock, list); if (err) return err; else return err2; } /* * Invalidate any and all dirty buffers on a given inode. We are * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. * * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which * assumes that all the buffers are against the blockdev. */ void invalidate_inode_buffers(struct inode *inode) { if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->i_private_list; struct address_space *buffer_mapping = mapping->i_private_data; spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) __remove_assoc_queue(BH_ENTRY(list->next)); spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(invalidate_inode_buffers); /* * Remove any clean buffers from the inode's buffer list. This is called * when we're trying to free the inode itself. Those buffers can pin it. * * Returns true if all buffers were removed. */ int remove_inode_buffers(struct inode *inode) { int ret = 1; if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; struct list_head *list = &mapping->i_private_list; struct address_space *buffer_mapping = mapping->i_private_data; spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) { ret = 0; break; } __remove_assoc_queue(bh); } spin_unlock(&buffer_mapping->i_private_lock); } return ret; } /* * Create the appropriate buffers when given a folio for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. * * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations. */ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, gfp_t gfp) { struct buffer_head *bh, *head; long offset; struct mem_cgroup *memcg, *old_memcg; /* The folio lock pins the memcg */ memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); head = NULL; offset = folio_size(folio); while ((offset -= size) >= 0) { bh = alloc_buffer_head(gfp); if (!bh) goto no_grow; bh->b_this_page = head; bh->b_blocknr = -1; head = bh; bh->b_size = size; /* Link the buffer to its folio */ folio_set_bh(bh, folio, offset); } out: set_active_memcg(old_memcg); return head; /* * In case anything failed, we just free everything we got. */ no_grow: if (head) { do { bh = head; head = head->b_this_page; free_buffer_head(bh); } while (head); } goto out; } EXPORT_SYMBOL_GPL(folio_alloc_buffers); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size) { gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; return folio_alloc_buffers(page_folio(page), size, gfp); } EXPORT_SYMBOL_GPL(alloc_page_buffers); static inline void link_dev_buffers(struct folio *folio, struct buffer_head *head) { struct buffer_head *bh, *tail; bh = head; do { tail = bh; bh = bh->b_this_page; } while (bh); tail->b_this_page = head; folio_attach_private(folio, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) { sector_t retval = ~((sector_t)0); loff_t sz = bdev_nr_bytes(bdev); if (sz) { unsigned int sizebits = blksize_bits(size); retval = (sz >> sizebits); } return retval; } /* * Initialise the state of a blockdev folio's buffers. */ static sector_t folio_init_buffers(struct folio *folio, struct block_device *bdev, unsigned size) { struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh = head; bool uptodate = folio_test_uptodate(folio); sector_t block = div_u64(folio_pos(folio), size); sector_t end_block = blkdev_max_block(bdev, size); do { if (!buffer_mapped(bh)) { bh->b_end_io = NULL; bh->b_private = NULL; bh->b_bdev = bdev; bh->b_blocknr = block; if (uptodate) set_buffer_uptodate(bh); if (block < end_block) set_buffer_mapped(bh); } block++; bh = bh->b_this_page; } while (bh != head); /* * Caller needs to validate requested block against end of device. */ return end_block; } /* * Create the page-cache folio that contains the requested block. * * This is used purely for blockdev mappings. * * Returns false if we have a failure which cannot be cured by retrying * without sleeping. Returns true if we succeeded, or the caller should retry. */ static bool grow_dev_folio(struct block_device *bdev, sector_t block, pgoff_t index, unsigned size, gfp_t gfp) { struct address_space *mapping = bdev->bd_mapping; struct folio *folio; struct buffer_head *bh; sector_t end_block = 0; folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return false; bh = folio_buffers(folio); if (bh) { if (bh->b_size == size) { end_block = folio_init_buffers(folio, bdev, size); goto unlock; } /* * Retrying may succeed; for example the folio may finish * writeback, or buffers may be cleaned. This should not * happen very often; maybe we have old buffers attached to * this blockdev's page cache and we're trying to change * the block size? */ if (!try_to_free_buffers(folio)) { end_block = ~0ULL; goto unlock; } } bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); if (!bh) goto unlock; /* * Link the folio to the buffers and initialise them. Take the * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ spin_lock(&mapping->i_private_lock); link_dev_buffers(folio, bh); end_block = folio_init_buffers(folio, bdev, size); spin_unlock(&mapping->i_private_lock); unlock: folio_unlock(folio); folio_put(folio); return block < end_block; } /* * Create buffers for the specified block device block's folio. If * that folio was dirty, the buffers are set dirty also. Returns false * if we've hit a permanent error. */ static bool grow_buffers(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { loff_t pos; /* * Check for a block which lies outside our maximum possible * pagecache index. */ if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) { printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n", __func__, (unsigned long long)block, bdev); return false; } /* Create a folio with the proper size buffers */ return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp); } static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { bool blocking = gfpflags_allow_blocking(gfp); if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) { printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n", size, bdev_logical_block_size(bdev)); return NULL; } for (;;) { struct buffer_head *bh; if (!grow_buffers(bdev, block, size, gfp)) return NULL; if (blocking) bh = __find_get_block_nonatomic(bdev, block, size); else bh = __find_get_block(bdev, block, size); if (bh) return bh; } } /* * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is * merely a hint about the true dirty state. * * When a page is set dirty in its entirety, all its buffers are marked dirty * (if the page has buffers). * * When a buffer is marked dirty, its page is dirtied, but the page's other * buffers are not. * * Also. When blockdev buffers are explicitly read with bread(), they * individually become uptodate. But their backing page remains not * uptodate - even if all of its buffers are uptodate. A subsequent * block_read_full_folio() against that folio will discover all the uptodate * buffers, will set the folio uptodate and will perform no I/O. */ /** * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * * mark_buffer_dirty() will set the dirty bit against the buffer, then set * its backing page dirty, then tag the page as dirty in the page cache * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) { WARN_ON_ONCE(!buffer_uptodate(bh)); trace_block_dirty_buffer(bh); /* * Very *carefully* optimize the it-is-already-dirty case. * * Don't let the final "is it dirty" escape to before we * perhaps modified the buffer. */ if (buffer_dirty(bh)) { smp_mb(); if (buffer_dirty(bh)) return; } if (!test_set_buffer_dirty(bh)) { struct folio *folio = bh->b_folio; struct address_space *mapping = NULL; if (!folio_test_set_dirty(folio)) { mapping = folio->mapping; if (mapping) __folio_mark_dirty(folio, mapping, 0); } if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); } EXPORT_SYMBOL(mark_buffer_write_io_error); /** * __brelse - Release a buffer. * @bh: The buffer to release. * * This variant of brelse() can be called if @bh is guaranteed to not be NULL. */ void __brelse(struct buffer_head *bh) { if (atomic_read(&bh->b_count)) { put_bh(bh); return; } WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); } EXPORT_SYMBOL(__brelse); /** * __bforget - Discard any dirty data in a buffer. * @bh: The buffer to forget. * * This variant of bforget() can be called if @bh is guaranteed to not * be NULL. */ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_folio->mapping; spin_lock(&buffer_mapping->i_private_lock); list_del_init(&bh->b_assoc_buffers); bh->b_assoc_map = NULL; spin_unlock(&buffer_mapping->i_private_lock); } __brelse(bh); } EXPORT_SYMBOL(__bforget); static struct buffer_head *__bread_slow(struct buffer_head *bh) { lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); return bh; } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; } brelse(bh); return NULL; } /* * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their * refcount elevated by one when they're in an LRU. A buffer can only appear * once in a particular CPU's LRU. A single buffer can be present in multiple * CPU's LRUs at the same time. * * This is a transparent caching front-end to sb_bread(), sb_getblk() and * sb_find_get_block(). * * The LRUs themselves only need locking against invalidate_bh_lrus. We use * a local interrupt disable for that. */ #define BH_LRU_SIZE 16 struct bh_lru { struct buffer_head *bhs[BH_LRU_SIZE]; }; static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; #ifdef CONFIG_SMP #define bh_lru_lock() local_irq_disable() #define bh_lru_unlock() local_irq_enable() #else #define bh_lru_lock() preempt_disable() #define bh_lru_unlock() preempt_enable() #endif static inline void check_irqs_on(void) { #ifdef irqs_disabled BUG_ON(irqs_disabled()); #endif } /* * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is * inserted at the front, and the buffer_head at the back if any is evicted. * Or, if already in the LRU it is moved to the front. */ static void bh_lru_install(struct buffer_head *bh) { struct buffer_head *evictee = bh; struct bh_lru *b; int i; check_irqs_on(); bh_lru_lock(); /* * the refcount of buffer_head in bh_lru prevents dropping the * attached page(i.e., try_to_free_buffers) so it could cause * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return; } b = this_cpu_ptr(&bh_lrus); for (i = 0; i < BH_LRU_SIZE; i++) { swap(evictee, b->bhs[i]); if (evictee == bh) { bh_lru_unlock(); return; } } get_bh(bh); bh_lru_unlock(); brelse(evictee); } /* * Look up the bh in this cpu's LRU. If it's there, move it to the head. */ static struct buffer_head * lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *ret = NULL; unsigned int i; check_irqs_on(); bh_lru_lock(); if (cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return NULL; } for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); if (bh && bh->b_blocknr == block && bh->b_bdev == bdev && bh->b_size == size) { if (i) { while (i) { __this_cpu_write(bh_lrus.bhs[i], __this_cpu_read(bh_lrus.bhs[i - 1])); i--; } __this_cpu_write(bh_lrus.bhs[0], bh); } get_bh(bh); ret = bh; break; } } bh_lru_unlock(); return ret; } /* * Perform a pagecache lookup for the matching buffer. If it's there, refresh * it in the LRU and mark it as accessed. If it is not present then return * NULL. Atomic context callers may also return NULL if the buffer is being * migrated; similarly the page is not marked accessed either. */ static struct buffer_head * find_get_block_common(struct block_device *bdev, sector_t block, unsigned size, bool atomic) { struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { /* __find_get_block_slow will mark the page accessed */ bh = __find_get_block_slow(bdev, block, atomic); if (bh) bh_lru_install(bh); } else touch_buffer(bh); return bh; } struct buffer_head * __find_get_block(struct block_device *bdev, sector_t block, unsigned size) { return find_get_block_common(bdev, block, size, true); } EXPORT_SYMBOL(__find_get_block); /* same as __find_get_block() but allows sleeping contexts */ struct buffer_head * __find_get_block_nonatomic(struct block_device *bdev, sector_t block, unsigned size) { return find_get_block_common(bdev, block, size, false); } EXPORT_SYMBOL(__find_get_block_nonatomic); /** * bdev_getblk - Get a buffer_head in a block device's buffer cache. * @bdev: The block device. * @block: The block number. * @size: The size of buffer_heads for this @bdev. * @gfp: The memory allocation flags to use. * * The returned buffer head has its reference count incremented, but is * not locked. The caller should call brelse() when it has finished * with the buffer. The buffer may not be uptodate. If needed, the * caller can bring it uptodate either by reading it or overwriting it. * * Return: The buffer head, or NULL if memory could not be allocated. */ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { struct buffer_head *bh; if (gfpflags_allow_blocking(gfp)) bh = __find_get_block_nonatomic(bdev, block, size); else bh = __find_get_block(bdev, block, size); might_alloc(gfp); if (bh) return bh; return __getblk_slow(bdev, block, size, gfp); } EXPORT_SYMBOL(bdev_getblk); /* * Do async read-ahead on a buffer.. */ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = bdev_getblk(bdev, block, size, GFP_NOWAIT | __GFP_MOVABLE); if (likely(bh)) { bh_readahead(bh, REQ_RAHEAD); brelse(bh); } } EXPORT_SYMBOL(__breadahead); /** * __bread_gfp() - Read a block. * @bdev: The block device to read from. * @block: Block number in units of block size. * @size: The block size of this device in bytes. * @gfp: Not page allocation flags; see below. * * You are not expected to call this function. You should use one of * sb_bread(), sb_bread_unmovable() or __bread(). * * Read a specified block, and return the buffer head that refers to it. * If @gfp is 0, the memory will be allocated using the block device's * default GFP flags. If @gfp is __GFP_MOVABLE, the memory may be * allocated from a movable area. Do not pass in a complete set of * GFP flags. * * The returned buffer head has its refcount increased. The caller should * call brelse() when it has finished with the buffer. * * Context: May sleep waiting for I/O. * Return: NULL if the block was unreadable. */ struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { struct buffer_head *bh; gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS); /* * Prefer looping in the allocator rather than here, at least that * code knows what it's doing. */ gfp |= __GFP_NOFAIL; bh = bdev_getblk(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } EXPORT_SYMBOL(__bread_gfp); static void __invalidate_bh_lrus(struct bh_lru *b) { int i; for (i = 0; i < BH_LRU_SIZE; i++) { brelse(b->bhs[i]); b->bhs[i] = NULL; } } /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq * or with preempt disabled. */ static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); __invalidate_bh_lrus(b); put_cpu_var(bh_lrus); } bool has_bh_in_lru(int cpu, void *dummy) { struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); int i; for (i = 0; i < BH_LRU_SIZE; i++) { if (b->bhs[i]) return true; } return false; } void invalidate_bh_lrus(void) { on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); /* * It's called from workqueue context so we need a bh_lru_lock to close * the race with preemption/irq. */ void invalidate_bh_lrus_cpu(void) { struct bh_lru *b; bh_lru_lock(); b = this_cpu_ptr(&bh_lrus); __invalidate_bh_lrus(b); bh_lru_unlock(); } void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset) { bh->b_folio = folio; BUG_ON(offset >= folio_size(folio)); if (folio_test_highmem(folio)) /* * This catches illegal uses and preserves the offset: */ bh->b_data = (char *)(0 + offset); else bh->b_data = folio_address(folio) + offset; } EXPORT_SYMBOL(folio_set_bh); /* * Called when truncating a buffer on a page completely. */ /* Bits that are cleared during an invalidate */ #define BUFFER_FLAGS_DISCARD \ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ 1 << BH_Delay | 1 << BH_Unwritten) static void discard_buffer(struct buffer_head * bh) { unsigned long b_state; lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; b_state = READ_ONCE(bh->b_state); do { } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state, b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } /** * block_invalidate_folio - Invalidate part or all of a buffer-backed folio. * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * * block_invalidate_folio() is called when all or part of the folio has been * invalidated by a truncate operation. * * block_invalidate_folio() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct buffer_head *head, *bh, *next; size_t curr_off = 0; size_t stop = length + offset; BUG_ON(!folio_test_locked(folio)); /* * Check for overflow */ BUG_ON(stop > folio_size(folio) || stop < length); head = folio_buffers(folio); if (!head) return; bh = head; do { size_t next_off = curr_off + bh->b_size; next = bh->b_this_page; /* * Are we still fully in range ? */ if (next_off > stop) goto out; /* * is this block fully invalidated? */ if (offset <= curr_off) discard_buffer(bh); curr_off = next_off; bh = next; } while (bh != head); /* * We release buffers only if the entire folio is being invalidated. * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */ if (length == folio_size(folio)) filemap_release_folio(folio, 0); out: folio_clear_mappedtodisk(folio); } EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock. */ struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; head = folio_alloc_buffers(folio, blocksize, gfp); bh = head; do { bh->b_state |= b_state; tail = bh; bh = bh->b_this_page; } while (bh); tail->b_this_page = head; spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { if (folio_test_dirty(folio)) set_buffer_dirty(bh); if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); bh = bh->b_this_page; } while (bh != head); } folio_attach_private(folio, head); spin_unlock(&folio->mapping->i_private_lock); return head; } EXPORT_SYMBOL(create_empty_buffers); /** * clean_bdev_aliases: clean a range of buffers in block device * @bdev: Block device to clean buffers in * @block: Start of a range of blocks to clean * @len: Number of blocks to clean * * We are taking a range of blocks for data and we don't want writeback of any * buffer-cache aliases starting from return from this function and until the * moment when something will explicitly mark the buffer dirty (hopefully that * will not happen until we will free that block ;-) We don't even need to mark * it not-uptodate - nobody can expect anything from a newly allocated buffer * anyway. We used to use unmap_buffer() for such invalidation, but that was * wrong. We definitely don't want to mark the alias unmapped, for example - it * would confuse anyone who might pick it with bread() afterwards... * * Also.. Note that bforget() doesn't lock the buffer. So there can be * writeout I/O going on against recently-freed buffers. We don't wait on that * I/O in bforget() - it's more efficient to wait on the I/O only if we really * need to. That happens here. */ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { struct address_space *bd_mapping = bdev->bd_mapping; const int blkbits = bd_mapping->host->i_blkbits; struct folio_batch fbatch; pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE; pgoff_t end; int i, count; struct buffer_head *bh; struct buffer_head *head; end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE; folio_batch_init(&fbatch); while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) { count = folio_batch_count(&fbatch); for (i = 0; i < count; i++) { struct folio *folio = fbatch.folios[i]; if (!folio_buffers(folio)) continue; /* * We use folio lock instead of bd_mapping->i_private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ folio_lock(folio); /* Recheck when the folio is locked which pins bhs */ head = folio_buffers(folio); if (!head) goto unlock_page; bh = head; do { if (!buffer_mapped(bh) || (bh->b_blocknr < block)) goto next; if (bh->b_blocknr >= block + len) break; clear_buffer_dirty(bh); wait_on_buffer(bh); clear_buffer_req(bh); next: bh = bh->b_this_page; } while (bh != head); unlock_page: folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); /* End of range already reached? */ if (index > end || !index) break; } } EXPORT_SYMBOL(clean_bdev_aliases); static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) { struct buffer_head *bh; BUG_ON(!folio_test_locked(folio)); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, 1 << READ_ONCE(inode->i_blkbits), b_state); return bh; } /* * NOTE! All mapped/uptodate combinations are valid: * * Mapped Uptodate Meaning * * No No "unknown" - must do get_block() * No Yes "hole" - zero-filled * Yes No "allocated" - allocated on disk, not read in * Yes Yes "valid" - allocated and up-to-date in memory. * * "Dirty" is valid only with the last case (mapped+uptodate). */ /* * While block_write_full_folio is writing back the dirty buffers under * the page lock, whoever dirtied the buffers may decide to clean them * again at any time. We handle that by only looking at the buffer * state inside lock_buffer(). * * If block_write_full_folio() is called for regular writeback * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. * * If block_write_full_folio() is called with wbc->sync_mode == * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc) { int err; sector_t block; sector_t last_block; struct buffer_head *bh, *head; size_t blocksize; int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); head = folio_create_buffers(folio, inode, (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. */ bh = head; blocksize = bh->b_size; block = div_u64(folio_pos(folio), blocksize); last_block = div_u64(i_size_read(inode) - 1, blocksize); /* * Get all the dirty buffers mapped to disk addresses and * handle any aliases from the underlying blockdev's mapping. */ do { if (block > last_block) { /* * mapped buffers outside i_size will occur, because * this folio can be outside i_size when there is a * truncate in progress. */ /* * The buffer was zeroed by block_write_full_folio() */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); clean_bdev_bh_alias(bh); } } bh = bh->b_this_page; block++; } while (bh != head); do { if (!buffer_mapped(bh)) continue; /* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the folio. Note that this can * potentially cause a busy-wait loop from writeback threads * and kswapd activity, but those code paths have their own * higher-level throttling. */ if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { folio_redirty_for_writepage(wbc, folio); continue; } if (test_clear_buffer_dirty(bh)) { mark_buffer_async_write_endio(bh, end_buffer_async_write); } else { unlock_buffer(bh); } } while ((bh = bh->b_this_page) != head); /* * The folio and its buffers are protected by the writeback flag, * so we can drop the bh refcounts early. */ BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, inode->i_write_hint, wbc); nr_underway++; } bh = next; } while (bh != head); folio_unlock(folio); err = 0; done: if (nr_underway == 0) { /* * The folio was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * write_dirty_buffer/submit_bh. A rare case. */ folio_end_writeback(folio); /* * The folio and buffer_heads can be released at any time from * here on. */ } return err; recover: /* * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. * The folio is currently locked and not marked for writeback */ bh = head; /* Recovery: lock and submit the mapped buffers */ do { if (buffer_mapped(bh) && buffer_dirty(bh) && !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write_endio(bh, end_buffer_async_write); } else { /* * The buffer may have been set dirty during * attachment to a dirty folio. */ clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); BUG_ON(folio_test_writeback(folio)); mapping_set_error(folio->mapping, err); folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, inode->i_write_hint, wbc); nr_underway++; } bh = next; } while (bh != head); folio_unlock(folio); goto done; } EXPORT_SYMBOL(__block_write_full_folio); /* * If a folio has any new buffers, zero them out here, and mark them uptodate * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. */ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; struct buffer_head *head, *bh; BUG_ON(!folio_test_locked(folio)); head = folio_buffers(folio); if (!head) return; bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { size_t start, xend; start = max(from, block_start); xend = min(to, block_end); folio_zero_segment(folio, start, xend); set_buffer_uptodate(bh); } clear_buffer_new(bh); mark_buffer_dirty(bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } EXPORT_SYMBOL(folio_zero_new_buffers); static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { loff_t offset = (loff_t)block << inode->i_blkbits; bh->b_bdev = iomap->bdev; /* * Block points to offset in file we need to map, iomap contains * the offset at which the map starts. If the map ends before the * current block, then do not map the buffer and let the caller * handle it. */ if (offset >= iomap->offset + iomap->length) return -EIO; switch (iomap->type) { case IOMAP_HOLE: /* * If the buffer is not up to date or beyond the current EOF, * we need to mark it as new to ensure sub-block zeroing is * executed if necessary. */ if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); return 0; case IOMAP_DELALLOC: if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); set_buffer_uptodate(bh); set_buffer_mapped(bh); set_buffer_delay(bh); return 0; case IOMAP_UNWRITTEN: /* * For unwritten regions, we always need to ensure that regions * in the block we are not writing to are zeroed. Mark the * buffer as new to ensure this. */ set_buffer_new(bh); set_buffer_unwritten(bh); fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) || offset >= i_size_read(inode)) { /* * This can happen if truncating the block device races * with the check in the caller as i_size updates on * block devices aren't synchronized by i_rwsem for * block devices. */ if (S_ISBLK(inode->i_mode)) return -EIO; set_buffer_new(bh); } bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> inode->i_blkbits; set_buffer_mapped(bh); return 0; default: WARN_ON_ONCE(1); return -EIO; } } int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap) { size_t from = offset_in_folio(folio, pos); size_t to = from + len; struct inode *inode = folio->mapping->host; size_t block_start, block_end; sector_t block; int err = 0; size_t blocksize; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; BUG_ON(!folio_test_locked(folio)); BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; block = div_u64(folio_pos(folio), blocksize); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); if (get_block) err = get_block(inode, block, bh, 1); else err = iomap_to_bh(inode, block, bh, iomap); if (err) break; if (buffer_new(bh)) { clean_bdev_bh_alias(bh); if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { bh_read_nowait(bh, 0); *wait_bh++=bh; } } /* * If we issued read requests - let them complete. */ while(wait_bh > wait) { wait_on_buffer(*--wait_bh); if (!buffer_uptodate(*wait_bh)) err = -EIO; } if (unlikely(err)) folio_zero_new_buffers(folio, from, to); return err; } int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { return __block_write_begin_int(folio, pos, len, get_block, NULL); } EXPORT_SYMBOL(__block_write_begin); void block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; unsigned blocksize; struct buffer_head *bh, *head; bh = head = folio_buffers(folio); if (!bh) return; blocksize = bh->b_size; block_start = 0; do { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = true; } else { set_buffer_uptodate(bh); mark_buffer_dirty(bh); } if (buffer_new(bh)) clear_buffer_new(bh); block_start = block_end; bh = bh->b_this_page; } while (bh != head); /* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus read_folio() for * the next read(). Here we 'discover' whether the folio went * uptodate as a result of this (potentially partial) write. */ if (!partial) folio_mark_uptodate(folio); } EXPORT_SYMBOL(block_commit_write); /* * block_write_begin takes care of the basic task of block allocation and * bringing partial write blocks uptodate first. * * The filesystem needs to handle block truncation upon failure. */ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; int status; folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); status = __block_write_begin_int(folio, pos, len, get_block, NULL); if (unlikely(status)) { folio_unlock(folio); folio_put(folio); folio = NULL; } *foliop = folio; return status; } EXPORT_SYMBOL(block_write_begin); int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *folio) { size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { /* * The buffers that were written will now be uptodate, so * we don't have to worry about a read_folio reading them * and overwriting a partial write. However if we have * encountered a short write and only partially written * into a buffer, it will not be marked uptodate, so a * read_folio might come in and destroy our partial write. * * Do the simplest thing, and just treat any short write to a * non uptodate folio as a zero-length write, and force the * caller to redo the whole thing. */ if (!folio_test_uptodate(folio)) copied = 0; folio_zero_new_buffers(folio, start+copied, start+len); } flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ block_commit_write(folio, start, start + copied); return copied; } EXPORT_SYMBOL(block_write_end); int generic_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; copied = block_write_end(pos, len, copied, folio); /* * No need to use i_size_read() here, the i_size cannot change under us * because we hold i_rwsem. * * But it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. */ if (pos + copied > inode->i_size) { i_size_write(inode, pos + copied); i_size_changed = true; } folio_unlock(folio); folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock * ordering of page lock and transaction start for journaling * filesystems. */ if (i_size_changed) mark_inode_dirty(inode); return copied; } EXPORT_SYMBOL(generic_write_end); /* * block_is_partially_uptodate checks whether buffers within a folio are * uptodate or not. * * Returns true if all buffers which correspond to the specified part * of the folio are uptodate. */ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { unsigned block_start, block_end, blocksize; unsigned to; struct buffer_head *bh, *head; bool ret = true; head = folio_buffers(folio); if (!head) return false; blocksize = head->b_size; to = min_t(unsigned, folio_size(folio) - from, count); to = from + to; if (from < blocksize && to > folio_size(folio) - blocksize) return false; bh = head; block_start = 0; do { block_end = block_start + blocksize; if (block_end > from && block_start < to) { if (!buffer_uptodate(bh)) { ret = false; break; } if (block_end >= to) break; } block_start = block_end; bh = bh->b_this_page; } while (bh != head); return ret; } EXPORT_SYMBOL(block_is_partially_uptodate); /* * Generic "read_folio" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the folio asynchronously --- the unlock_buffer() and * set/clear_buffer_uptodate() functions propagate buffer state into the * folio once IO has completed. */ int block_read_full_folio(struct folio *folio, get_block_t *get_block) { struct inode *inode = folio->mapping->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *prev = NULL; size_t blocksize; int fully_mapped = 1; bool page_error = false; loff_t limit = i_size_read(inode); /* This is needed for ext4. */ if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) limit = inode->i_sb->s_maxbytes; head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; iblock = div_u64(folio_pos(folio), blocksize); lblock = div_u64(limit + blocksize - 1, blocksize); bh = head; do { if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { int err = 0; fully_mapped = 0; if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); if (err) page_error = true; } if (!buffer_mapped(bh)) { folio_zero_range(folio, bh_offset(bh), blocksize); if (!err) set_buffer_uptodate(bh); continue; } /* * get_block() might have updated the buffer * synchronously */ if (buffer_uptodate(bh)) continue; } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); continue; } mark_buffer_async_read(bh); if (prev) submit_bh(REQ_OP_READ, prev); prev = bh; } while (iblock++, (bh = bh->b_this_page) != head); if (fully_mapped) folio_set_mappedtodisk(folio); /* * All buffers are uptodate or get_block() returned an error * when trying to map them - we must finish the read because * end_buffer_async_read() will never be called on any buffer * in this folio. */ if (prev) submit_bh(REQ_OP_READ, prev); else folio_end_read(folio, !page_error); return 0; } EXPORT_SYMBOL(block_read_full_folio); /* utility function for filesystems that need to do work on expanding * truncates. Uses filesystem pagecache writes to allow the filesystem to * deal with the hole. */ int generic_cont_expand_simple(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct folio *folio; void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); if (err) goto out; err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); if (err) goto out; err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); BUG_ON(err > 0); out: return err; } EXPORT_SYMBOL(generic_cont_expand_simple); static int cont_expand_zero(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct folio *folio; void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; int err = 0; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) { zerofrom = curpos & ~PAGE_MASK; if (zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } len = PAGE_SIZE - zerofrom; err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); err = 0; balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { err = -EINTR; goto out; } } /* page covers the boundary, find the boundary offset */ if (index == curidx) { zerofrom = curpos & ~PAGE_MASK; /* if we will expand the thing last block will be filled */ if (offset <= zerofrom) { goto out; } if (zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } len = offset - zerofrom; err = aops->write_begin(iocb, mapping, curpos, len, &folio, &fsdata); if (err) goto out; folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(iocb, mapping, curpos, len, len, folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); err = 0; } out: return err; } /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; unsigned int blocksize = i_blocksize(inode); unsigned int zerofrom; int err; err = cont_expand_zero(iocb, mapping, pos, bytes); if (err) return err; zerofrom = *bytes & ~PAGE_MASK; if (pos+len > *bytes && zerofrom & (blocksize-1)) { *bytes |= (blocksize-1); (*bytes)++; } return block_write_begin(mapping, pos, len, foliop, get_block); } EXPORT_SYMBOL(cont_write_begin); /* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_rwsem here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. * * Direct callers of this function should protect against filesystem freezing * using sb_start_pagefault() - sb_end_pagefault() functions. */ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vma->vm_file); unsigned long end; loff_t size; int ret; folio_lock(folio); size = i_size_read(inode); if ((folio->mapping != inode->i_mapping) || (folio_pos(folio) >= size)) { /* We overload EFAULT to mean page got truncated */ ret = -EFAULT; goto out_unlock; } end = folio_size(folio); /* folio is wholly or partially inside EOF */ if (folio_pos(folio) + end > size) end = size - folio_pos(folio); ret = __block_write_begin_int(folio, 0, end, get_block, NULL); if (unlikely(ret)) goto out_unlock; block_commit_write(folio, 0, end); folio_mark_dirty(folio); folio_wait_stable(folio); return 0; out_unlock: folio_unlock(folio); return ret; } EXPORT_SYMBOL(block_page_mkwrite); int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { pgoff_t index = from >> PAGE_SHIFT; unsigned blocksize; sector_t iblock; size_t offset, length, pos; struct inode *inode = mapping->host; struct folio *folio; struct buffer_head *bh; int err = 0; blocksize = i_blocksize(inode); length = from & (blocksize - 1); /* Block boundary? Nothing to do */ if (!length) return 0; length = blocksize - length; iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return PTR_ERR(folio); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); if (err) goto unlock; /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) goto unlock; } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { err = bh_read(bh, 0); /* Uhhuh. Read error. Complain and punt. */ if (err < 0) goto unlock; } folio_zero_range(folio, offset, length); mark_buffer_dirty(bh); unlock: folio_unlock(folio); folio_put(folio); return err; } EXPORT_SYMBOL(block_truncate_page); /* * The generic write folio function for buffer-backed address_spaces */ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block) { struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); /* Is the folio fully inside i_size? */ if (folio_next_pos(folio) <= i_size) return __block_write_full_folio(inode, folio, get_block, wbc); /* Is the folio fully outside i_size? (truncate in progress) */ if (folio_pos(folio) >= i_size) { folio_unlock(folio); return 0; /* don't care */ } /* * The folio straddles i_size. It must be zeroed out on each and every * writeback invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); return __block_write_full_folio(inode, folio, get_block, wbc); } sector_t generic_block_bmap(struct address_space *mapping, sector_t block, get_block_t *get_block) { struct inode *inode = mapping->host; struct buffer_head tmp = { .b_size = i_blocksize(inode), }; get_block(inode, block, &tmp, 0); return tmp.b_blocknr; } EXPORT_SYMBOL(generic_block_bmap); static void end_bio_bh_io_sync(struct bio *bio) { struct buffer_head *bh = bio->bi_private; if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, enum rw_hint write_hint, struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; struct bio *bio; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); BUG_ON(buffer_delay(bh)); BUG_ON(buffer_unwritten(bh)); /* * Only clear out a write error when rewriting */ if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); if (buffer_meta(bh)) opf |= REQ_META; if (buffer_prio(bh)) opf |= REQ_PRIO; bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; /* Take care of bh's that straddle the end of the device */ guard_bio_eod(bio); if (wbc) { wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } submit_bio(bio); } void submit_bh(blk_opf_t opf, struct buffer_head *bh) { submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); } EXPORT_SYMBOL(submit_bh); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { lock_buffer(bh); if (!test_clear_buffer_dirty(bh)) { unlock_buffer(bh); return; } bh->b_end_io = end_buffer_write_sync; get_bh(bh); submit_bh(REQ_OP_WRITE | op_flags, bh); } EXPORT_SYMBOL(write_dirty_buffer); /* * For a data-integrity writeout, we need to wait upon any in-progress I/O * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. */ int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags) { WARN_ON(atomic_read(&bh->b_count) < 1); lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { /* * The bh should be mapped, but it might not be if the * device was hot-removed. Not much we can do but fail the I/O. */ if (!buffer_mapped(bh)) { unlock_buffer(bh); return -EIO; } get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | op_flags, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) return -EIO; } else { unlock_buffer(bh); } return 0; } EXPORT_SYMBOL(__sync_dirty_buffer); int sync_dirty_buffer(struct buffer_head *bh) { return __sync_dirty_buffer(bh, REQ_SYNC); } EXPORT_SYMBOL(sync_dirty_buffer); static inline int buffer_busy(struct buffer_head *bh) { return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } static bool drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free) { struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh; bh = head; do { if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; } while (bh != head); do { struct buffer_head *next = bh->b_this_page; if (bh->b_assoc_map) __remove_assoc_queue(bh); bh = next; } while (bh != head); *buffers_to_free = head; folio_detach_private(folio); return true; failed: return false; } /** * try_to_free_buffers - Release buffers attached to this folio. * @folio: The folio. * * If any buffers are in use (dirty, under writeback, elevated refcount), * no buffers will be freed. * * If the folio is dirty but all the buffers are clean then we need to * be sure to mark the folio clean as well. This is because the folio * may be against a block device, and a later reattachment of buffers * to a dirty folio will set *all* buffers dirty. Which would corrupt * filesystem data on the same device. * * The same applies to regular filesystem folios: if all the buffers are * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with * i_private_lock. * * Exclusion against try_to_free_buffers may be obtained by either * locking the folio or by holding its mapping's i_private_lock. * * Context: Process context. @folio must be locked. Will not sleep. * Return: true if all buffers attached to this folio were freed. */ bool try_to_free_buffers(struct folio *folio) { struct address_space * const mapping = folio->mapping; struct buffer_head *buffers_to_free = NULL; bool ret = 0; BUG_ON(!folio_test_locked(folio)); if (folio_test_writeback(folio)) return false; if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(folio, &buffers_to_free); goto out; } spin_lock(&mapping->i_private_lock); ret = drop_buffers(folio, &buffers_to_free); /* * If the filesystem writes its buffers by hand (eg ext3) * then we can have clean buffers against a dirty folio. We * clean the folio here; otherwise the VM will never notice * that the filesystem did any IO at all. * * Also, during truncate, discard_buffer will have marked all * the folio's buffers clean. We discover that here and clean * the folio also. * * i_private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) folio_cancel_dirty(folio); spin_unlock(&mapping->i_private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; do { struct buffer_head *next = bh->b_this_page; free_buffer_head(bh); bh = next; } while (bh != buffers_to_free); } return ret; } EXPORT_SYMBOL(try_to_free_buffers); /* * Buffer-head allocation */ static struct kmem_cache *bh_cachep __ro_after_init; /* * Once the number of bh's in the machine exceeds this level, we start * stripping them in writeback. */ static unsigned long max_buffer_heads __ro_after_init; int buffer_heads_over_limit; struct bh_accounting { int nr; /* Number of live bh's */ int ratelimit; /* Limit cacheline bouncing */ }; static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; static void recalc_bh_state(void) { int i; int tot = 0; if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096) return; __this_cpu_write(bh_accounting.ratelimit, 0); for_each_online_cpu(i) tot += per_cpu(bh_accounting, i).nr; buffer_heads_over_limit = (tot > max_buffer_heads); } struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) { struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); spin_lock_init(&ret->b_uptodate_lock); preempt_disable(); __this_cpu_inc(bh_accounting.nr); recalc_bh_state(); preempt_enable(); } return ret; } EXPORT_SYMBOL(alloc_buffer_head); void free_buffer_head(struct buffer_head *bh) { BUG_ON(!list_empty(&bh->b_assoc_buffers)); kmem_cache_free(bh_cachep, bh); preempt_disable(); __this_cpu_dec(bh_accounting.nr); recalc_bh_state(); preempt_enable(); } EXPORT_SYMBOL(free_buffer_head); static int buffer_exit_cpu_dead(unsigned int cpu) { int i; struct bh_lru *b = &per_cpu(bh_lrus, cpu); for (i = 0; i < BH_LRU_SIZE; i++) { brelse(b->bhs[i]); b->bhs[i] = NULL; } this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); per_cpu(bh_accounting, cpu).nr = 0; return 0; } /** * bh_uptodate_or_lock - Test whether the buffer is uptodate * @bh: struct buffer_head * * Return true if the buffer is up-to-date and false, * with the buffer locked, if not. */ int bh_uptodate_or_lock(struct buffer_head *bh) { if (!buffer_uptodate(bh)) { lock_buffer(bh); if (!buffer_uptodate(bh)) return 0; unlock_buffer(bh); } return 1; } EXPORT_SYMBOL(bh_uptodate_or_lock); /** * __bh_read - Submit read for a locked buffer * @bh: struct buffer_head * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ * @wait: wait until reading finish * * Returns zero on success or don't wait, and -EIO on error. */ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { int ret = 0; BUG_ON(!buffer_locked(bh)); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ | op_flags, bh); if (wait) { wait_on_buffer(bh); if (!buffer_uptodate(bh)) ret = -EIO; } return ret; } EXPORT_SYMBOL(__bh_read); /** * __bh_read_batch - Submit read for a batch of unlocked buffers * @nr: entry number of the buffer batch * @bhs: a batch of struct buffer_head * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ * @force_lock: force to get a lock on the buffer if set, otherwise drops any * buffer that cannot lock. * * Returns zero on success or don't wait, and -EIO on error. */ void __bh_read_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags, bool force_lock) { int i; for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; if (buffer_uptodate(bh)) continue; if (force_lock) lock_buffer(bh); else if (!trylock_buffer(bh)) continue; if (buffer_uptodate(bh)) { unlock_buffer(bh); continue; } bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } } EXPORT_SYMBOL(__bh_read_batch); void __init buffer_init(void) { unsigned long nrpages; int ret; bh_cachep = KMEM_CACHE(buffer_head, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ nrpages = (nr_free_buffer_pages() * 10) / 100; max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead", NULL, buffer_exit_cpu_dead); WARN_ON(ret < 0); }
3988 66 2566 116 233 166 3580 2842 52 2928 3613 1 8 781 1220 12 1066 1219 1042 10 1042 253 6 218 23 111 886 40 875 254 12 434 12 20 1004 678 24 43 22 227 16 1 187 393 371 875 304 309 6 14 13 1359 728 1316 116 250 186 166 34 61 37 1077 12 14 1077 29 9 56 1 11 549 975 536 10 740 11 11 10 11 11 81 81 10 81 697 697 697 53 53 53 4 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 /* SPDX-License-Identifier: GPL-2.0 */ /* * Macros for manipulating and testing page->flags */ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H #include <linux/types.h> #include <linux/bug.h> #include <linux/mmdebug.h> #ifndef __GENERATING_BOUNDS_H #include <linux/mm_types.h> #include <generated/bounds.h> #endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: * * PG_reserved is set for special pages. The "struct page" of such a page * should in general not be touched (e.g. set dirty) except by its owner. * Pages marked as PG_reserved include: * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, * initrd, HW tables) * - Pages reserved or allocated early during boot (before the page allocator * was initialized). This includes (depending on the architecture) the * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much * much more. Once (if ever) freed, PG_reserved is cleared and they will * be given to the page allocator. * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) * Some PG_reserved pages will be excluded from the hibernation image. * PG_reserved does in general not hinder anybody from dumping or swapping * and is no longer required for remap_pfn_range(). ioremap might require it. * Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage. * * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, /* Anonymous memory (and shmem) */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Some filesystems */ PG_checked = PG_owner_priv_1, /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped * THP), PG_anon_exclusive may be set only for the head page or for * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ PG_anon_exclusive = PG_owner_2, /* * Set if all buffer heads in the folio are mapped. * Filesystems which do not use BHs can use it for their own purpose. */ PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, #ifdef CONFIG_MIGRATION /* movable_ops page that is isolated for migration */ PG_movable_ops_isolated = PG_reclaim, /* this is a movable_ops page (for selected typed pages only) */ PG_movable_ops = PG_uptodate, #endif /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, #ifdef CONFIG_MEMORY_HOTPLUG /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif /* * Flags only valid for compound pages. Stored in first tail page's * flags word. Cannot use the first 8 flags or any flag marked as * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* * Return the real head page struct iff the @page is a fake head page, otherwise * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least * two contiguous pages. */ unsigned long head = READ_ONCE(page[1].compound_head); if (likely(head & 1)) return (const struct page *)(head - 1); } return page; } static __always_inline bool page_count_writable(const struct page *page, int u) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return true; /* * The refcount check is ordered before the fake-head check to prevent * the following race: * CPU 1 (HVO) CPU 2 (speculative PFN walker) * * page_ref_freeze() * synchronize_rcu() * rcu_read_lock() * page_is_fake_head() is false * vmemmap_remap_pte() * XXX: struct page[] becomes r/o * * page_ref_unfreeze() * page_ref_count() is not zero * * atomic_add_unless(&page->_refcount) * XXX: try to modify r/o struct page[] * * The refcount check also prevents modification attempts to other (r/o) * tail pages that are not fake heads. */ if (atomic_read_acquire(&page->_refcount) == u) return false; return page_fixed_fake_head(page) == page; } #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } static inline bool page_count_writable(const struct page *page, int u) { return true; } #endif static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) /** * page_folio - Converts from page to folio. * @p: The page. * * Every page is part of a folio. This function cannot be called on a * NULL pointer. * * Context: No reference, nor lock is required on @page. If the caller * does not hold a reference, this call may race with a folio split, so * it should re-check the folio still contains this page after gaining * a reference on the folio. * Return: The folio which contains this page. */ #define page_folio(p) (_Generic((p), \ const struct page *: (const struct folio *)_compound_head(p), \ struct page *: (struct folio *)_compound_head(p))) /** * folio_page - Return a page from a folio. * @folio: The folio. * @n: The page number to return. * * @n is relative to the start of the folio. This function does not * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ #define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif static const unsigned long *const_folio_flags(const struct folio *folio, unsigned n) { const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 #define FOLIO_HEAD_PAGE 0 #define FOLIO_SECOND_PAGE 1 /* * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ static __always_inline bool folio_test_##name(const struct folio *folio) \ { return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ { set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void folio_clear_##name(struct folio *folio) \ { clear_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_SET_FLAG(name, page) \ static __always_inline void __folio_set_##name(struct folio *folio) \ { __set_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void __folio_clear_##name(struct folio *folio) \ { __clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_SET_FLAG(name, page) \ static __always_inline bool folio_test_set_##name(struct folio *folio) \ { return test_and_set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_CLEAR_FLAG(name, page) \ static __always_inline bool folio_test_clear_##name(struct folio *folio) \ { return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_FLAG(name, page) \ FOLIO_TEST_FLAG(name, page) \ FOLIO_SET_FLAG(name, page) \ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define FOLIO_TEST_FLAG_FALSE(name) \ static inline bool folio_test_##name(const struct folio *folio) \ { return false; } #define FOLIO_SET_FLAG_NOOP(name) \ static inline void folio_set_##name(struct folio *folio) { } #define FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void folio_clear_##name(struct folio *folio) { } #define __FOLIO_SET_FLAG_NOOP(name) \ static inline void __folio_set_##name(struct folio *folio) { } #define __FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void __folio_clear_##name(struct folio *folio) { } #define FOLIO_TEST_SET_FLAG_FALSE(name) \ static inline bool folio_test_set_##name(struct folio *folio) \ { return false; } #define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ static inline bool folio_test_clear_##name(struct folio *folio) \ { return false; } #define FOLIO_FLAG_FALSE(name) \ FOLIO_TEST_FLAG_FALSE(name) \ FOLIO_SET_FLAG_NOOP(name) \ FOLIO_CLEAR_FLAG_NOOP(name) #define TESTPAGEFLAG_FALSE(uname, lname) \ FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ __FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) #define TESTSCFLAG_FALSE(uname, lname) \ TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) FOLIO_FLAG(active, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE) /* owner_2 can be set on tail pages for anon memory */ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif #define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? */ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP #define folio_test_partial_kmap(f) true #else #define folio_test_partial_kmap(f) folio_test_highmem(f) #endif #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && test_bit(PG_swapcache, const_folio_flags(folio, 0)); } FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(swapcache) #endif FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(mlocked) __FOLIO_CLEAR_FLAG_NOOP(mlocked) FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif #ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* See page_idle.h for !64BIT workaround */ #else /* !CONFIG_PAGE_IDLE_FLAG */ FOLIO_FLAG_FALSE(young) FOLIO_TEST_CLEAR_FLAG_FALSE(young) FOLIO_FLAG_FALSE(idle) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #ifdef CONFIG_MEMORY_HOTPLUG PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) #else PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* * On an anonymous folio mapped into a user virtual memory area, * folio->mapping points to its anon_vma, not to a struct address_space; * with the FOLIO_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged folio. See ksm.h. * * Please note that, confusingly, "folio_mapping" refers to the inode * address_space which maps the folio from disk; whereas "folio_mapped" * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its * internal states, the folio->mapping does not exist as such, nor do * these flags below. So in order to avoid testing non-existent bits, * please make sure that folio_test_slab(folio) actually evaluates to * false before calling the following functions (e.g., folio_test_anon). * See mm/slab.h. */ #define FOLIO_MAPPING_ANON 0x1 #define FOLIO_MAPPING_ANON_KSM 0x2 #define FOLIO_MAPPING_KSM (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) #define FOLIO_MAPPING_FLAGS (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; return (flags & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON; } static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_KSM; } #else FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. * @folio: The folio. * @mask: Bits set in this word will be changed. * * This must only be used for flags which are changed with the folio * lock held. For example, it is unsafe to use for PG_dirty as that * can be set without the folio lock held. It can also only be used * on flags which are in the range 0-6 as some of the implementations * only affect those bits. * * Return: Whether there are tasks waiting on the folio. */ static inline bool folio_xor_flags_has_waiters(struct folio *folio, unsigned long mask) { return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); } /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. * * The uptodate flag is set on a folio when every byte in the folio is * at least as new as the corresponding bytes on storage. Anonymous * and CoW folios are always uptodate. If the folio is not uptodate, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ static inline bool folio_test_uptodate(const struct folio *folio) { bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static inline bool PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } static __always_inline void __folio_mark_uptodate(struct folio *folio) { smp_wmb(); __set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void folio_mark_uptodate(struct folio *folio) { /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the folio * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void __SetPageUptodate(struct page *page) { __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) void __folio_start_writeback(struct folio *folio, bool keep_write); void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) static __always_inline bool folio_test_head(const struct folio *folio) { return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) __CLEARPAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. * * Return: True if the folio is larger than one page. */ static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } #else TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the * compound page. * * This flag is set by hwpoison handler. Cleared by THP split or free page. */ FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* * For pages that do not use mapcount, page_type may be used. * The low 24 bits of pagetype may be used for your own purposes, as long * as you are careful to not affect the top 8 bits. The low bits of * pagetype will be overwritten when you clear the page_type from the page. */ enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ PGTY_buddy = 0xf0, PGTY_offline = 0xf1, PGTY_table = 0xf2, PGTY_guard = 0xf3, PGTY_hugetlb = 0xf4, PGTY_slab = 0xf5, PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; static inline bool page_type_has_type(int page_type) { return page_type < (PGTY_mapcount_underflow << 24); } /* This takes a mapcount which is one more than page->_mapcount */ static inline bool page_mapcount_is_type(unsigned int mapcount) { return page_type_has_type(mapcount - 1); } static inline bool page_has_type(const struct page *page) { return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ if (folio_test_##fname(folio)) \ return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ if (folio->page.page_type == UINT_MAX) \ return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ if (Page##uname(page)) \ return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ if (page->page_type == UINT_MAX) \ return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * When a memory block gets onlined, all pages are initialized with a * refcount of 1 and PageOffline(). generic_online_page() will * take care of clearing PageOffline(). * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free (unmanaged) * pages, allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). * * Memory offlining code will not adjust the managed page count for any * PageOffline() pages, treating them like they were never exposed to the * buddy using generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); extern void page_offline_begin(void); extern void page_offline_end(void); /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard, guard) PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) #else FOLIO_TEST_FLAG_FALSE(hugetlb) #endif PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) /* * Mark pages that has to be accepted before touched for the first time. * * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. * * Context: Any context. * Return: True for hugetlbfs pages, false for anon pages or pages * belonging to other filesystems. */ static inline bool PageHuge(const struct page *page) { return folio_test_hugetlb(page_folio(page)); } /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with * failing hardware. */ static inline bool is_page_hwpoison(const struct page *page) { const struct folio *folio; if (PageHWPoison(page)) return true; folio = page_folio(page); return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } static inline bool folio_contain_hwpoisoned_page(struct folio *folio) { return folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); } bool is_free_buddy_page(const struct page *page); #ifdef CONFIG_MIGRATION /* * This page is migratable through movable_ops (for selected typed pages * only). * * Page migration of such pages might fail, for example, if the page is * already isolated by somebody else, or if the page is about to get freed. * * While a subsystem might set selected typed pages that support page migration * as being movable through movable_ops, it must never clear this flag. * * This flag is only cleared when the page is freed back to the buddy. * * Only selected page types support this flag (see page_movable_ops()) and * the flag might be used in other context for other pages. Always use * page_has_movable_ops() instead. */ TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); /* * A movable_ops page has this flag set while it is isolated for migration. * This flag primarily protects against concurrent migration attempts. * * Once migration ended (success or failure), the flag is cleared. The * flag is managed by the migration core. */ PAGEFLAG(MovableOpsIsolated, movable_ops_isolated, PF_NO_TAIL); #else /* !CONFIG_MIGRATION */ TESTPAGEFLAG_FALSE(MovableOps, movable_ops); SETPAGEFLAG_NOOP(MovableOps, movable_ops); PAGEFLAG_FALSE(MovableOpsIsolated, movable_ops_isolated); #endif /* CONFIG_MIGRATION */ /** * page_has_movable_ops - test for a movable_ops page * @page: The page to test. * * Test whether this is a movable_ops page. Such pages will stay that * way until freed. * * Returns true if this is a movable_ops page, otherwise false. */ static inline bool page_has_movable_ops(const struct page *page) { return PageMovableOps(page) && (PageOffline(page) || PageZsmalloc(page)); } static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); /* * HugeTLB stores this information on the head page; THP keeps it per * page */ if (PageHuge(page)) page = compound_head(page); return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) /* * Flags stored in the second page of a compound page. They may overlap * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * folio_has_private - Determine if folio has private stuff * @folio: The folio to be checked * * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ static inline int folio_has_private(const struct folio *folio) { return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2019 Hangzhou C-SKY Microsystems co.,ltd. */ #include <linux/perf_event.h> #include <linux/uaccess.h> #include <asm/stacktrace.h> static bool fill_callchain(void *entry, unsigned long pc) { return perf_callchain_store(entry, pc) == 0; } /* * This will be called when the target is in user mode * This function will only be called when we use * "PERF_SAMPLE_CALLCHAIN" in * kernel/events/core.c:perf_prepare_sample() * * How to trigger perf_callchain_[user/kernel] : * $ perf record -e cpu-clock --call-graph fp ./program * $ perf report --call-graph * * On RISC-V platform, the program being sampled and the C library * need to be compiled with -fno-omit-frame-pointer, otherwise * the user stack will not contain function frame. */ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } arch_stack_walk_user(fill_callchain, entry, regs); } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } walk_stackframe(NULL, regs, fill_callchain, entry); }
153 122 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 // SPDX-License-Identifier: GPL-2.0 /* * All the USB notify logic * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * notifier functions originally based on those in kernel/sys.c * but fixed up to not be so broken. * * Released under the GPLv2 only. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/usb.h> #include <linux/mutex.h> #include "usb.h" static BLOCKING_NOTIFIER_HEAD(usb_notifier_list); /** * usb_register_notify - register a notifier callback whenever a usb change happens * @nb: pointer to the notifier block for the callback events. * * These changes are either USB devices or busses being added or removed. */ void usb_register_notify(struct notifier_block *nb) { blocking_notifier_chain_register(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_register_notify); /** * usb_unregister_notify - unregister a notifier callback * @nb: pointer to the notifier block for the callback events. * * usb_register_notify() must have been previously called for this function * to work properly. */ void usb_unregister_notify(struct notifier_block *nb) { blocking_notifier_chain_unregister(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_unregister_notify); void usb_notify_add_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev); } void usb_notify_remove_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_REMOVE, udev); } void usb_notify_add_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus); } void usb_notify_remove_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus); }
38 144 32 34 99 102 102 102 102 101 102 102 102 184 185 187 15 162 164 186 184 185 6 142 187 142 144 143 142 140 144 142 5 5 5 5 5 5 5 1 1 1 2 1 1 1 2 102 98 101 71 141 139 139 72 2 74 138 140 140 1 141 73 1 139 3 140 141 141 4 139 140 140 136 138 11 12 141 140 141 45 44 38 37 38 38 38 32 31 29 35 42 29 141 138 7 140 140 141 141 141 1 141 141 137 58 59 6 2 57 57 57 57 1 1 6 2 2 1 13 12 1 11 1 10 9 4 6 6 5 5 1 1 11 13 48 7 6 48 2 2 2 2 2 1 1 2 2 2 2 2 11 11 10 11 11 79 77 24 78 3 2 2 2 2 2 2 3 79 80 79 78 79 76 79 79 21 11 62 62 62 79 79 20 17 20 80 25 79 21 20 19 19 19 18 19 20 3 2 2 4 2 2 1 6 3 2 1 2 3 4 3 2 2 2 2 2 1 1 2 3 2 2 1 2 2 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 // SPDX-License-Identifier: GPL-2.0-or-later /* Keyring handling * * Copyright (C) 2004-2005, 2008, 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/err.h> #include <linux/user_namespace.h> #include <linux/nsproxy.h> #include <keys/keyring-type.h> #include <keys/user-type.h> #include <linux/assoc_array_priv.h> #include <linux/uaccess.h> #include <net/net_namespace.h> #include "internal.h" /* * When plumbing the depths of the key tree, this sets a hard limit * set on how deep we're willing to go. */ #define KEYRING_SEARCH_MAX_DEPTH 6 /* * We mark pointers we pass to the associative array with bit 1 set if * they're keyrings and clear otherwise. */ #define KEYRING_PTR_SUBTYPE 0x2UL static inline bool keyring_ptr_is_keyring(const struct assoc_array_ptr *x) { return (unsigned long)x & KEYRING_PTR_SUBTYPE; } static inline struct key *keyring_ptr_to_key(const struct assoc_array_ptr *x) { void *object = assoc_array_ptr_to_leaf(x); return (struct key *)((unsigned long)object & ~KEYRING_PTR_SUBTYPE); } static inline void *keyring_key_to_ptr(struct key *key) { if (key->type == &key_type_keyring) return (void *)((unsigned long)key | KEYRING_PTR_SUBTYPE); return key; } static DEFINE_RWLOCK(keyring_name_lock); /* * Clean up the bits of user_namespace that belong to us. */ void key_free_user_ns(struct user_namespace *ns) { write_lock(&keyring_name_lock); list_del_init(&ns->keyring_name_list); write_unlock(&keyring_name_lock); key_put(ns->user_keyring_register); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif } /* * The keyring key type definition. Keyrings are simply keys of this type and * can be treated as ordinary keys in addition to having their own special * operations. */ static int keyring_preparse(struct key_preparsed_payload *prep); static void keyring_free_preparse(struct key_preparsed_payload *prep); static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep); static void keyring_revoke(struct key *keyring); static void keyring_destroy(struct key *keyring); static void keyring_describe(const struct key *keyring, struct seq_file *m); static long keyring_read(const struct key *keyring, char *buffer, size_t buflen); struct key_type key_type_keyring = { .name = "keyring", .def_datalen = 0, .preparse = keyring_preparse, .free_preparse = keyring_free_preparse, .instantiate = keyring_instantiate, .revoke = keyring_revoke, .destroy = keyring_destroy, .describe = keyring_describe, .read = keyring_read, }; EXPORT_SYMBOL(key_type_keyring); /* * Semaphore to serialise link/link calls to prevent two link calls in parallel * introducing a cycle. */ static DEFINE_MUTEX(keyring_serialise_link_lock); /* * Publish the name of a keyring so that it can be found by name (if it has * one and it doesn't begin with a dot). */ static void keyring_publish_name(struct key *keyring) { struct user_namespace *ns = current_user_ns(); if (keyring->description && keyring->description[0] && keyring->description[0] != '.') { write_lock(&keyring_name_lock); list_add_tail(&keyring->name_link, &ns->keyring_name_list); write_unlock(&keyring_name_lock); } } /* * Preparse a keyring payload */ static int keyring_preparse(struct key_preparsed_payload *prep) { return prep->datalen != 0 ? -EINVAL : 0; } /* * Free a preparse of a user defined key payload */ static void keyring_free_preparse(struct key_preparsed_payload *prep) { } /* * Initialise a keyring. * * Returns 0 on success, -EINVAL if given any data. */ static int keyring_instantiate(struct key *keyring, struct key_preparsed_payload *prep) { assoc_array_init(&keyring->keys); /* make the keyring available by name if it has one */ keyring_publish_name(keyring); return 0; } /* * Multiply 64-bits by 32-bits to 96-bits and fold back to 64-bit. Ideally we'd * fold the carry back too, but that requires inline asm. */ static u64 mult_64x32_and_fold(u64 x, u32 y) { u64 hi = (u64)(u32)(x >> 32) * y; u64 lo = (u64)(u32)(x) * y; return lo + ((u64)(u32)hi << 32) + (u32)(hi >> 32); } /* * Hash a key type and description. */ static void hash_key_type_and_desc(struct keyring_index_key *index_key) { const unsigned level_shift = ASSOC_ARRAY_LEVEL_STEP; const unsigned long fan_mask = ASSOC_ARRAY_FAN_MASK; const char *description = index_key->description; unsigned long hash, type; u32 piece; u64 acc; int n, desc_len = index_key->desc_len; type = (unsigned long)index_key->type; acc = mult_64x32_and_fold(type, desc_len + 13); acc = mult_64x32_and_fold(acc, 9207); piece = (unsigned long)index_key->domain_tag; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); for (;;) { n = desc_len; if (n <= 0) break; if (n > 4) n = 4; piece = 0; memcpy(&piece, description, n); description += n; desc_len -= n; acc = mult_64x32_and_fold(acc, piece); acc = mult_64x32_and_fold(acc, 9207); } /* Fold the hash down to 32 bits if need be. */ hash = acc; if (ASSOC_ARRAY_KEY_CHUNK_SIZE == 32) hash ^= acc >> 32; /* Squidge all the keyrings into a separate part of the tree to * ordinary keys by making sure the lowest level segment in the hash is * zero for keyrings and non-zero otherwise. */ if (index_key->type != &key_type_keyring && (hash & fan_mask) == 0) hash |= (hash >> (ASSOC_ARRAY_KEY_CHUNK_SIZE - level_shift)) | 1; else if (index_key->type == &key_type_keyring && (hash & fan_mask) != 0) hash = (hash + (hash << level_shift)) & ~fan_mask; index_key->hash = hash; } /* * Finalise an index key to include a part of the description actually in the * index key, to set the domain tag and to calculate the hash. */ void key_set_index_key(struct keyring_index_key *index_key) { static struct key_tag default_domain_tag = { .usage = REFCOUNT_INIT(1), }; size_t n = min_t(size_t, index_key->desc_len, sizeof(index_key->desc)); memcpy(index_key->desc, index_key->description, n); if (!index_key->domain_tag) { if (index_key->type->flags & KEY_TYPE_NET_DOMAIN) index_key->domain_tag = current->nsproxy->net_ns->key_domain; else index_key->domain_tag = &default_domain_tag; } hash_key_type_and_desc(index_key); } /** * key_put_tag - Release a ref on a tag. * @tag: The tag to release. * * This releases a reference the given tag and returns true if that ref was the * last one. */ bool key_put_tag(struct key_tag *tag) { if (refcount_dec_and_test(&tag->usage)) { kfree_rcu(tag, rcu); return true; } return false; } /** * key_remove_domain - Kill off a key domain and gc its keys * @domain_tag: The domain tag to release. * * This marks a domain tag as being dead and releases a ref on it. If that * wasn't the last reference, the garbage collector is poked to try and delete * all keys that were in the domain. */ void key_remove_domain(struct key_tag *domain_tag) { domain_tag->removed = true; if (!key_put_tag(domain_tag)) key_schedule_gc_links(); } /* * Build the next index key chunk. * * We return it one word-sized chunk at a time. */ static unsigned long keyring_get_key_chunk(const void *data, int level) { const struct keyring_index_key *index_key = data; unsigned long chunk = 0; const u8 *d; int desc_len = index_key->desc_len, n = sizeof(chunk); level /= ASSOC_ARRAY_KEY_CHUNK_SIZE; switch (level) { case 0: return index_key->hash; case 1: return index_key->x; case 2: return (unsigned long)index_key->type; case 3: return (unsigned long)index_key->domain_tag; default: level -= 4; if (desc_len <= sizeof(index_key->desc)) return 0; d = index_key->description + sizeof(index_key->desc); d += level * sizeof(long); desc_len -= sizeof(index_key->desc); if (desc_len > n) desc_len = n; do { chunk <<= 8; chunk |= *d++; } while (--desc_len > 0); return chunk; } } static unsigned long keyring_get_object_key_chunk(const void *object, int level) { const struct key *key = keyring_ptr_to_key(object); return keyring_get_key_chunk(&key->index_key, level); } static bool keyring_compare_object(const void *object, const void *data) { const struct keyring_index_key *index_key = data; const struct key *key = keyring_ptr_to_key(object); return key->index_key.type == index_key->type && key->index_key.domain_tag == index_key->domain_tag && key->index_key.desc_len == index_key->desc_len && memcmp(key->index_key.description, index_key->description, index_key->desc_len) == 0; } /* * Compare the index keys of a pair of objects and determine the bit position * at which they differ - if they differ. */ static int keyring_diff_objects(const void *object, const void *data) { const struct key *key_a = keyring_ptr_to_key(object); const struct keyring_index_key *a = &key_a->index_key; const struct keyring_index_key *b = data; unsigned long seg_a, seg_b; int level, i; level = 0; seg_a = a->hash; seg_b = b->hash; if ((seg_a ^ seg_b) != 0) goto differ; level += ASSOC_ARRAY_KEY_CHUNK_SIZE / 8; /* The number of bits contributed by the hash is controlled by a * constant in the assoc_array headers. Everything else thereafter we * can deal with as being machine word-size dependent. */ seg_a = a->x; seg_b = b->x; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); /* The next bit may not work on big endian */ seg_a = (unsigned long)a->type; seg_b = (unsigned long)b->type; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); seg_a = (unsigned long)a->domain_tag; seg_b = (unsigned long)b->domain_tag; if ((seg_a ^ seg_b) != 0) goto differ; level += sizeof(unsigned long); i = sizeof(a->desc); if (a->desc_len <= i) goto same; for (; i < a->desc_len; i++) { seg_a = *(unsigned char *)(a->description + i); seg_b = *(unsigned char *)(b->description + i); if ((seg_a ^ seg_b) != 0) goto differ_plus_i; } same: return -1; differ_plus_i: level += i; differ: i = level * 8 + __ffs(seg_a ^ seg_b); return i; } /* * Free an object after stripping the keyring flag off of the pointer. */ static void keyring_free_object(void *object) { key_put(keyring_ptr_to_key(object)); } /* * Operations for keyring management by the index-tree routines. */ static const struct assoc_array_ops keyring_assoc_array_ops = { .get_key_chunk = keyring_get_key_chunk, .get_object_key_chunk = keyring_get_object_key_chunk, .compare_object = keyring_compare_object, .diff_objects = keyring_diff_objects, .free_object = keyring_free_object, }; /* * Clean up a keyring when it is destroyed. Unpublish its name if it had one * and dispose of its data. * * The garbage collector detects the final key_put(), removes the keyring from * the serial number tree and then does RCU synchronisation before coming here, * so we shouldn't need to worry about code poking around here with the RCU * readlock held by this time. */ static void keyring_destroy(struct key *keyring) { if (keyring->description) { write_lock(&keyring_name_lock); if (keyring->name_link.next != NULL && !list_empty(&keyring->name_link)) list_del(&keyring->name_link); write_unlock(&keyring_name_lock); } if (keyring->restrict_link) { struct key_restriction *keyres = keyring->restrict_link; key_put(keyres->key); kfree(keyres); } assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } /* * Describe a keyring for /proc. */ static void keyring_describe(const struct key *keyring, struct seq_file *m) { if (keyring->description) seq_puts(m, keyring->description); else seq_puts(m, "[anon]"); if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", keyring->keys.nr_leaves_on_tree); else seq_puts(m, ": empty"); } } struct keyring_read_iterator_context { size_t buflen; size_t count; key_serial_t *buffer; }; static int keyring_read_iterator(const void *object, void *data) { struct keyring_read_iterator_context *ctx = data; const struct key *key = keyring_ptr_to_key(object); kenter("{%s,%d},,{%zu/%zu}", key->type->name, key->serial, ctx->count, ctx->buflen); if (ctx->count >= ctx->buflen) return 1; *ctx->buffer++ = key->serial; ctx->count += sizeof(key->serial); return 0; } /* * Read a list of key IDs from the keyring's contents in binary form * * The keyring's semaphore is read-locked by the caller. This prevents someone * from modifying it under us - which could cause us to read key IDs multiple * times. */ static long keyring_read(const struct key *keyring, char *buffer, size_t buflen) { struct keyring_read_iterator_context ctx; long ret; kenter("{%d},,%zu", key_serial(keyring), buflen); if (buflen & (sizeof(key_serial_t) - 1)) return -EINVAL; /* Copy as many key IDs as fit into the buffer */ if (buffer && buflen) { ctx.buffer = (key_serial_t *)buffer; ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { kleave(" = %ld [iterate]", ret); return ret; } } /* Return the size of the buffer needed */ ret = keyring->keys.nr_leaves_on_tree * sizeof(key_serial_t); if (ret <= buflen) kleave("= %ld [ok]", ret); else kleave("= %ld [buffer too small]", ret); return ret; } /* * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, uid, gid, cred, perm, flags, restrict_link); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { key_put(keyring); keyring = ERR_PTR(ret); } } return keyring; } EXPORT_SYMBOL(keyring_alloc); /** * restrict_link_reject - Give -EPERM to restrict link * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. * @restriction_key: Keys providing additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * * This is meant to be stored in a key_restriction structure which is passed * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, const union key_payload *payload, struct key *restriction_key) { return -EPERM; } /* * By default, we keys found by getting an exact match on their descriptions. */ bool key_default_cmp(const struct key *key, const struct key_match_data *match_data) { return strcmp(key->description, match_data->raw_data) == 0; } /* * Iteration function to consider each key found. */ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); unsigned long kflags = READ_ONCE(key->flags); short state = READ_ONCE(key->state); kenter("{%d}", key->serial); /* ignore keys not of this type */ if (key->type != ctx->index_key.type) { kleave(" = 0 [!type]"); return 0; } /* skip invalidated, revoked and expired keys */ if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { time64_t expiry = READ_ONCE(key->expiry); if (kflags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { ctx->result = ERR_PTR(-EKEYREVOKED); kleave(" = %d [invrev]", ctx->skipped_ret); goto skipped; } if (expiry && ctx->now >= expiry) { if (!(ctx->flags & KEYRING_SEARCH_SKIP_EXPIRED)) ctx->result = ERR_PTR(-EKEYEXPIRED); kleave(" = %d [expire]", ctx->skipped_ret); goto skipped; } } /* keys that don't match */ if (!ctx->match_data.cmp(key, &ctx->match_data)) { kleave(" = 0 [!match]"); return 0; } /* key must have search permissions */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) { ctx->result = ERR_PTR(-EACCES); kleave(" = %d [!perm]", ctx->skipped_ret); goto skipped; } if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ if (state < 0) { ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } } /* Found */ ctx->result = make_key_ref(key, ctx->possessed); kleave(" = 1 [found]"); return 1; skipped: return ctx->skipped_ret; } /* * Search inside a keyring for a key. We can search by walking to it * directly based on its index-key or we can iterate over the entire * tree looking for it, based on the match function. */ static int search_keyring(struct key *keyring, struct keyring_search_context *ctx) { if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_DIRECT) { const void *object; object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, &ctx->index_key); return object ? ctx->iterator(object, ctx) : 0; } return assoc_array_iterate(&keyring->keys, ctx->iterator, ctx); } /* * Search a tree of keyrings that point to other keyrings up to the maximum * depth. */ static bool search_nested_keyrings(struct key *keyring, struct keyring_search_context *ctx) { struct { struct key *keyring; struct assoc_array_node *node; int slot; } stack[KEYRING_SEARCH_MAX_DEPTH]; struct assoc_array_shortcut *shortcut; struct assoc_array_node *node; struct assoc_array_ptr *ptr; struct key *key; int sp = 0, slot; kenter("{%d},{%s,%s}", keyring->serial, ctx->index_key.type->name, ctx->index_key.description); #define STATE_CHECKS (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_DO_STATE_CHECK) BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); if (ctx->index_key.description) key_set_index_key(&ctx->index_key); /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. */ if (ctx->match_data.lookup_type == KEYRING_SEARCH_LOOKUP_ITERATE || keyring_compare_object(keyring, &ctx->index_key)) { ctx->skipped_ret = 2; switch (ctx->iterator(keyring_key_to_ptr(keyring), ctx)) { case 1: goto found; case 2: return false; default: break; } } ctx->skipped_ret = 0; /* Start processing a new keyring */ descend_to_keyring: kdebug("descend to %d", keyring->serial); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto not_this_keyring; /* Search through the keys in this keyring before its searching its * subtrees. */ if (search_keyring(keyring, ctx)) goto found; /* Then manually iterate through the keyrings nested in this one. * * Start from the root node of the index tree. Because of the way the * hash function has been set up, keyrings cluster on the leftmost * branch of the root node (root slot 0) or in the root node itself. * Non-keyrings avoid the leftmost branch of the root entirely (root * slots 1-15). */ if (!(ctx->flags & KEYRING_SEARCH_RECURSE)) goto not_this_keyring; ptr = READ_ONCE(keyring->keys.root); if (!ptr) goto not_this_keyring; if (assoc_array_ptr_is_shortcut(ptr)) { /* If the root is a shortcut, either the keyring only contains * keyring pointers (everything clusters behind root slot 0) or * doesn't contain any keyring pointers. */ shortcut = assoc_array_ptr_to_shortcut(ptr); if ((shortcut->index_key[0] & ASSOC_ARRAY_FAN_MASK) != 0) goto not_this_keyring; ptr = READ_ONCE(shortcut->next_node); node = assoc_array_ptr_to_node(ptr); goto begin_node; } node = assoc_array_ptr_to_node(ptr); ptr = node->slots[0]; if (!assoc_array_ptr_is_meta(ptr)) goto begin_node; descend_to_node: /* Descend to a more distal node in this keyring's content tree and go * through that. */ kdebug("descend"); if (assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->next_node); BUG_ON(!assoc_array_ptr_is_node(ptr)); } node = assoc_array_ptr_to_node(ptr); begin_node: kdebug("begin_node"); slot = 0; ascend_to_node: /* Go through the slots in a node */ for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) { ptr = READ_ONCE(node->slots[slot]); if (assoc_array_ptr_is_meta(ptr)) { if (node->back_pointer || assoc_array_ptr_is_shortcut(ptr)) goto descend_to_node; } if (!keyring_ptr_is_keyring(ptr)) continue; key = keyring_ptr_to_key(ptr); if (sp >= KEYRING_SEARCH_MAX_DEPTH) { if (ctx->flags & KEYRING_SEARCH_DETECT_TOO_DEEP) { ctx->result = ERR_PTR(-ELOOP); return false; } goto not_this_keyring; } /* Search a nested keyring */ if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM) && key_task_permission(make_key_ref(key, ctx->possessed), ctx->cred, KEY_NEED_SEARCH) < 0) continue; /* stack the current position */ stack[sp].keyring = keyring; stack[sp].node = node; stack[sp].slot = slot; sp++; /* begin again with the new keyring */ keyring = key; goto descend_to_keyring; } /* We've dealt with all the slots in the current node, so now we need * to ascend to the parent and continue processing there. */ ptr = READ_ONCE(node->back_pointer); slot = node->parent_slot; if (ptr && assoc_array_ptr_is_shortcut(ptr)) { shortcut = assoc_array_ptr_to_shortcut(ptr); ptr = READ_ONCE(shortcut->back_pointer); slot = shortcut->parent_slot; } if (!ptr) goto not_this_keyring; node = assoc_array_ptr_to_node(ptr); slot++; /* If we've ascended to the root (zero backpointer), we must have just * finished processing the leftmost branch rather than the root slots - * so there can't be any more keyrings for us to find. */ if (node->back_pointer) { kdebug("ascend %d", slot); goto ascend_to_node; } /* The keyring we're looking at was disqualified or didn't contain a * matching key. */ not_this_keyring: kdebug("not_this_keyring %d", sp); if (sp <= 0) { kleave(" = false"); return false; } /* Resume the processing of a keyring higher up in the tree */ sp--; keyring = stack[sp].keyring; node = stack[sp].node; slot = stack[sp].slot + 1; kdebug("ascend to %d [%d]", keyring->serial, slot); goto ascend_to_node; /* We found a viable match */ found: key = key_ref_to_ptr(ctx->result); key_check(key); if (!(ctx->flags & KEYRING_SEARCH_NO_UPDATE_TIME)) { key->last_used_at = ctx->now; keyring->last_used_at = ctx->now; while (sp > 0) stack[--sp].keyring->last_used_at = ctx->now; } kleave(" = true"); return true; } /** * keyring_search_rcu - Search a keyring tree for a matching key under RCU * @keyring_ref: A pointer to the keyring with possession indicator. * @ctx: The keyring search context. * * Search the supplied keyring tree for a key that matches the criteria given. * The root keyring and any linked keyrings must grant Search permission to the * caller to be searchable and keys can only be found if they too grant Search * to the caller. The possession flag on the root keyring pointer controls use * of the possessor bits in permissions checking of the entire tree. In * addition, the LSM gets to forbid keyring searches and key matches. * * The search is performed as a breadth-then-depth search up to the prescribed * limit (KEYRING_SEARCH_MAX_DEPTH). The caller must hold the RCU read lock to * prevent keyrings from being destroyed or rearranged whilst they are being * searched. * * Keys are matched to the type provided and are then filtered by the match * function, which is given the description to use in any way it sees fit. The * match function may use any attributes of a key that it wishes to * determine the match. Normally the match function from the key type would be * used. * * RCU can be used to prevent the keyring key lists from disappearing without * the need to take lots of locks. * * Returns a pointer to the found key and increments the key usage count if * successful; -EAGAIN if no matching keys were found, or if expired or revoked * keys were found; -ENOKEY if only negative keys were found; -ENOTDIR if the * specified keyring wasn't a keyring. * * In the case of a successful return, the possession attribute from * @keyring_ref is propagated to the returned key reference. */ key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx) { struct key *keyring; long err; ctx->iterator = keyring_search_iterator; ctx->possessed = is_key_possessed(keyring_ref); ctx->result = ERR_PTR(-EAGAIN); keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return ERR_PTR(-ENOTDIR); if (!(ctx->flags & KEYRING_SEARCH_NO_CHECK_PERM)) { err = key_task_permission(keyring_ref, ctx->cred, KEY_NEED_SEARCH); if (err < 0) return ERR_PTR(err); } ctx->now = ktime_get_real_seconds(); if (search_nested_keyrings(keyring, ctx)) __key_get(key_ref_to_ptr(ctx->result)); return ctx->result; } /** * keyring_search - Search the supplied keyring tree for a matching key * @keyring: The root of the keyring tree to be searched. * @type: The type of keyring we want to find. * @description: The name of the keyring we want to find. * @recurse: True to search the children of @keyring also * * As keyring_search_rcu() above, but using the current task's credentials and * type's default matching function and preferred search method. */ key_ref_t keyring_search(key_ref_t keyring, struct key_type *type, const char *description, bool recurse) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; key_ref_t key; int ret; if (recurse) ctx.flags |= KEYRING_SEARCH_RECURSE; if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) return ERR_PTR(ret); } rcu_read_lock(); key = keyring_search_rcu(keyring, &ctx); rcu_read_unlock(); if (type->match_free) type->match_free(&ctx.match_data); return key; } EXPORT_SYMBOL(keyring_search); static struct key_restriction *keyring_restriction_alloc( key_restrict_link_func_t check) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; return keyres; } /* * Semaphore to serialise restriction setup to prevent reference count * cycles through restriction key pointers. */ static DECLARE_RWSEM(keyring_serialise_restrict_sem); /* * Check for restriction cycles that would prevent keyring garbage collection. * keyring_serialise_restrict_sem must be held. */ static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, struct key_restriction *keyres) { while (keyres && keyres->key && keyres->key->type == &key_type_keyring) { if (keyres->key == dest_keyring) return true; keyres = keyres->key->restrict_link; } return false; } /** * keyring_restrict - Look up and apply a restriction to a keyring * @keyring_ref: The keyring to be restricted * @type: The key type that will provide the restriction checker. * @restriction: The restriction options to apply to the keyring * * Look up a keyring and apply a restriction to it. The restriction is managed * by the specific key type, but can be configured by the options specified in * the restriction string. */ int keyring_restrict(key_ref_t keyring_ref, const char *type, const char *restriction) { struct key *keyring; struct key_type *restrict_type = NULL; struct key_restriction *restrict_link; int ret = 0; keyring = key_ref_to_ptr(keyring_ref); key_check(keyring); if (keyring->type != &key_type_keyring) return -ENOTDIR; if (!type) { restrict_link = keyring_restriction_alloc(restrict_link_reject); } else { restrict_type = key_type_lookup(type); if (IS_ERR(restrict_type)) return PTR_ERR(restrict_type); if (!restrict_type->lookup_restriction) { ret = -ENOENT; goto error; } restrict_link = restrict_type->lookup_restriction(restriction); } if (IS_ERR(restrict_link)) { ret = PTR_ERR(restrict_link); goto error; } down_write(&keyring->sem); down_write(&keyring_serialise_restrict_sem); if (keyring->restrict_link) { ret = -EEXIST; } else if (keyring_detect_restriction_cycle(keyring, restrict_link)) { ret = -EDEADLK; } else { keyring->restrict_link = restrict_link; notify_key(keyring, NOTIFY_KEY_SETATTR, 0); } up_write(&keyring_serialise_restrict_sem); up_write(&keyring->sem); if (ret < 0) { key_put(restrict_link->key); kfree(restrict_link); } error: if (restrict_type) key_type_put(restrict_type); return ret; } EXPORT_SYMBOL(keyring_restrict); /* * Search the given keyring for a key that might be updated. * * The caller must guarantee that the keyring is a keyring and that the * permission is granted to modify the keyring as no check is made here. The * caller must also hold a lock on the keyring semaphore. * * Returns a pointer to the found key with usage count incremented if * successful and returns NULL if not found. Revoked and invalidated keys are * skipped over. * * If successful, the possession indicator is propagated from the keyring ref * to the returned key reference. */ key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key) { struct key *keyring, *key; const void *object; keyring = key_ref_to_ptr(keyring_ref); kenter("{%d},{%s,%s}", keyring->serial, index_key->type->name, index_key->description); object = assoc_array_find(&keyring->keys, &keyring_assoc_array_ops, index_key); if (object) goto found; kleave(" = NULL"); return NULL; found: key = keyring_ptr_to_key(object); if (key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) { kleave(" = NULL [x]"); return NULL; } __key_get(key); kleave(" = {%d}", key->serial); return make_key_ref(key, is_key_possessed(keyring_ref)); } /* * Find a keyring with the specified name. * * Only keyrings that have nonzero refcount, are not revoked, and are owned by a * user in the current user namespace are considered. If @uid_keyring is %true, * the keyring additionally must have been allocated as a user or user session * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct user_namespace *ns = current_user_ns(); struct key *keyring; if (!name) return ERR_PTR(-EINVAL); read_lock(&keyring_name_lock); /* Search this hash bucket for a keyring with a matching name that * grants Search permission and that hasn't been revoked */ list_for_each_entry(keyring, &ns->keyring_name_list, name_link) { if (!kuid_has_mapping(ns, keyring->user->uid)) continue; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) continue; if (strcmp(keyring->description, name) != 0) continue; if (uid_keyring) { if (!test_bit(KEY_FLAG_UID_KEYRING, &keyring->flags)) continue; } else { if (key_permission(make_key_ref(keyring, 0), KEY_NEED_SEARCH) < 0) continue; } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. it has a zero usage count) */ if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = ktime_get_real_seconds(); goto out; } keyring = ERR_PTR(-ENOKEY); out: read_unlock(&keyring_name_lock); return keyring; } static int keyring_detect_cycle_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); kenter("{%d}", key->serial); /* We might get a keyring with matching index-key that is nonetheless a * different keyring. */ if (key != ctx->match_data.raw_data) return 0; ctx->result = ERR_PTR(-EDEADLK); return 1; } /* * See if a cycle will be created by inserting acyclic tree B in acyclic * tree A at the topmost level (ie: as a direct child of A). * * Since we are adding B to A at the top level, checking for cycles should just * be a matter of seeing if node A is somewhere in tree B. */ static int keyring_detect_cycle(struct key *A, struct key *B) { struct keyring_search_context ctx = { .index_key = A->index_key, .match_data.raw_data = A, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .iterator = keyring_detect_cycle_iterator, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_NO_UPDATE_TIME | KEYRING_SEARCH_NO_CHECK_PERM | KEYRING_SEARCH_DETECT_TOO_DEEP | KEYRING_SEARCH_RECURSE), }; rcu_read_lock(); search_nested_keyrings(B, &ctx); rcu_read_unlock(); return PTR_ERR(ctx.result) == -EAGAIN ? 0 : PTR_ERR(ctx.result); } /* * Lock keyring for link. */ int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key) __acquires(&keyring->sem) __acquires(&keyring_serialise_link_lock) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Lock keyrings for move (link/unlink combination). */ int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key) __acquires(&l_keyring->sem) __acquires(&u_keyring->sem) __acquires(&keyring_serialise_link_lock) { if (l_keyring->type != &key_type_keyring || u_keyring->type != &key_type_keyring) return -ENOTDIR; /* We have to be very careful here to take the keyring locks in the * right order, lest we open ourselves to deadlocking against another * move operation. */ if (l_keyring < u_keyring) { down_write(&l_keyring->sem); down_write_nested(&u_keyring->sem, 1); } else { down_write(&u_keyring->sem); down_write_nested(&l_keyring->sem, 1); } /* Serialise link/link calls to prevent parallel calls causing a cycle * when linking two keyring in opposite orders. */ if (index_key->type == &key_type_keyring) mutex_lock(&keyring_serialise_link_lock); return 0; } /* * Preallocate memory so that a key can be linked into to a keyring. */ int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; int ret; kenter("%d,%s,%s,", keyring->serial, index_key->type->name, index_key->description); BUG_ON(index_key->desc_len == 0); BUG_ON(*_edit != NULL); *_edit = NULL; ret = -EKEYREVOKED; if (test_bit(KEY_FLAG_REVOKED, &keyring->flags)) goto error; /* Create an edit script that will insert/replace the key in the * keyring tree. */ edit = assoc_array_insert(&keyring->keys, &keyring_assoc_array_ops, index_key, NULL); if (IS_ERR(edit)) { ret = PTR_ERR(edit); goto error; } /* If we're not replacing a link in-place then we're going to need some * extra quota. */ if (!edit->dead_leaf) { ret = key_payload_reserve(keyring, keyring->datalen + KEYQUOTA_LINK_BYTES); if (ret < 0) goto error_cancel; } *_edit = edit; kleave(" = 0"); return 0; error_cancel: assoc_array_cancel_edit(edit); error: kleave(" = %d", ret); return ret; } /* * Check already instantiated keys aren't going to be a problem. * * The caller must have called __key_link_begin(). Don't need to call this for * keys that were created since __key_link_begin() was called. */ int __key_link_check_live_key(struct key *keyring, struct key *key) { if (key->type == &key_type_keyring) /* check that we aren't going to create a cycle by linking one * keyring to another */ return keyring_detect_cycle(keyring, key); return 0; } /* * Link a key into to a keyring. * * Must be called with __key_link_begin() having being called. Discards any * already extant link to matching key if there is one, so that each keyring * holds at most one link to any given key of a particular type+description * combination. */ void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { __key_get(key); assoc_array_insert_set_object(*_edit, keyring_key_to_ptr(key)); assoc_array_apply_edit(*_edit); *_edit = NULL; notify_key(keyring, NOTIFY_KEY_LINKED, key_serial(key)); } /* * Finish linking a key into to a keyring. * * Must be called with __key_link_begin() having being called. */ void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit) __releases(&keyring->sem) __releases(&keyring_serialise_link_lock) { BUG_ON(index_key->type == NULL); kenter("%d,%s,", keyring->serial, index_key->type->name); if (edit) { if (!edit->dead_leaf) { key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } assoc_array_cancel_edit(edit); } up_write(&keyring->sem); if (index_key->type == &key_type_keyring) mutex_unlock(&keyring_serialise_link_lock); } /* * Check addition of keys to restricted keyrings. */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; return keyring->restrict_link->check(keyring, key->type, &key->payload, keyring->restrict_link->key); } /** * key_link - Link a key to a keyring * @keyring: The keyring to make the link in. * @key: The key to link to. * * Make a link in a keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring. * * This function will write-lock the keyring's semaphore and will consume some * of the user's key data quota to hold the link. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, * -EKEYREVOKED if the keyring has been revoked, -ENFILE if the keyring is * full, -EDQUOT if there is insufficient key data quota remaining to add * another link or -ENOMEM if there's insufficient memory. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_link(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_lock(keyring, &key->index_key); if (ret < 0) goto error; ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret < 0) goto error_end; kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); if (ret == 0) __key_link(keyring, key, &edit); error_end: __key_link_end(keyring, &key->index_key, edit); error: kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); /* * Lock a keyring for unlink. */ static int __key_unlink_lock(struct key *keyring) __acquires(&keyring->sem) { if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); return 0; } /* * Begin the process of unlinking a key from a keyring. */ static int __key_unlink_begin(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { struct assoc_array_edit *edit; BUG_ON(*_edit != NULL); edit = assoc_array_delete(&keyring->keys, &keyring_assoc_array_ops, &key->index_key); if (IS_ERR(edit)) return PTR_ERR(edit); if (!edit) return -ENOENT; *_edit = edit; return 0; } /* * Apply an unlink change. */ static void __key_unlink(struct key *keyring, struct key *key, struct assoc_array_edit **_edit) { assoc_array_apply_edit(*_edit); notify_key(keyring, NOTIFY_KEY_UNLINKED, key_serial(key)); *_edit = NULL; key_payload_reserve(keyring, keyring->datalen - KEYQUOTA_LINK_BYTES); } /* * Finish unlinking a key from to a keyring. */ static void __key_unlink_end(struct key *keyring, struct key *key, struct assoc_array_edit *edit) __releases(&keyring->sem) { if (edit) assoc_array_cancel_edit(edit); up_write(&keyring->sem); } /** * key_unlink - Unlink the first link to a key from a keyring. * @keyring: The keyring to remove the link from. * @key: The key the link is to. * * Remove a link from a keyring to a key. * * This function will write-lock the keyring's semaphore. * * Returns 0 if successful, -ENOTDIR if the keyring isn't a keyring, -ENOENT if * the key isn't linked to by the keyring or -ENOMEM if there's insufficient * memory. * * It is assumed that the caller has checked that it is permitted for a link to * be removed (the keyring should have Write permission; no permissions are * required on the key). */ int key_unlink(struct key *keyring, struct key *key) { struct assoc_array_edit *edit = NULL; int ret; key_check(keyring); key_check(key); ret = __key_unlink_lock(keyring); if (ret < 0) return ret; ret = __key_unlink_begin(keyring, key, &edit); if (ret == 0) __key_unlink(keyring, key, &edit); __key_unlink_end(keyring, key, edit); return ret; } EXPORT_SYMBOL(key_unlink); /** * key_move - Move a key from one keyring to another * @key: The key to move * @from_keyring: The keyring to remove the link from. * @to_keyring: The keyring to make the link in. * @flags: Qualifying flags, such as KEYCTL_MOVE_EXCL. * * Make a link in @to_keyring to a key, such that the keyring holds a reference * on that key and the key can potentially be found by searching that keyring * whilst simultaneously removing a link to the key from @from_keyring. * * This function will write-lock both keyring's semaphores and will consume * some of the user's key data quota to hold the link on @to_keyring. * * Returns 0 if successful, -ENOTDIR if either keyring isn't a keyring, * -EKEYREVOKED if either keyring has been revoked, -ENFILE if the second * keyring is full, -EDQUOT if there is insufficient key data quota remaining * to add another link or -ENOMEM if there's insufficient memory. If * KEYCTL_MOVE_EXCL is set, then -EEXIST will be returned if there's already a * matching key in @to_keyring. * * It is assumed that the caller has checked that it is permitted for a link to * be made (the keyring should have Write permission and the key Link * permission). */ int key_move(struct key *key, struct key *from_keyring, struct key *to_keyring, unsigned int flags) { struct assoc_array_edit *from_edit = NULL, *to_edit = NULL; int ret; kenter("%d,%d,%d", key->serial, from_keyring->serial, to_keyring->serial); if (from_keyring == to_keyring) return 0; key_check(key); key_check(from_keyring); key_check(to_keyring); ret = __key_move_lock(from_keyring, to_keyring, &key->index_key); if (ret < 0) goto out; ret = __key_unlink_begin(from_keyring, key, &from_edit); if (ret < 0) goto error; ret = __key_link_begin(to_keyring, &key->index_key, &to_edit); if (ret < 0) goto error; ret = -EEXIST; if (to_edit->dead_leaf && (flags & KEYCTL_MOVE_EXCL)) goto error; ret = __key_link_check_restriction(to_keyring, key); if (ret < 0) goto error; ret = __key_link_check_live_key(to_keyring, key); if (ret < 0) goto error; __key_unlink(from_keyring, key, &from_edit); __key_link(to_keyring, key, &to_edit); error: __key_link_end(to_keyring, &key->index_key, to_edit); __key_unlink_end(from_keyring, key, from_edit); out: kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(key_move); /** * keyring_clear - Clear a keyring * @keyring: The keyring to clear. * * Clear the contents of the specified keyring. * * Returns 0 if successful or -ENOTDIR if the keyring isn't a keyring. */ int keyring_clear(struct key *keyring) { struct assoc_array_edit *edit; int ret; if (keyring->type != &key_type_keyring) return -ENOTDIR; down_write(&keyring->sem); edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (IS_ERR(edit)) { ret = PTR_ERR(edit); } else { if (edit) assoc_array_apply_edit(edit); notify_key(keyring, NOTIFY_KEY_CLEARED, 0); key_payload_reserve(keyring, 0); ret = 0; } up_write(&keyring->sem); return ret; } EXPORT_SYMBOL(keyring_clear); /* * Dispose of the links from a revoked keyring. * * This is called with the key sem write-locked. */ static void keyring_revoke(struct key *keyring) { struct assoc_array_edit *edit; edit = assoc_array_clear(&keyring->keys, &keyring_assoc_array_ops); if (!IS_ERR(edit)) { if (edit) assoc_array_apply_edit(edit); key_payload_reserve(keyring, 0); } } static bool keyring_gc_select_iterator(void *object, void *iterator_data) { struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; if (key_is_dead(key, *limit)) return false; key_get(key); return true; } static int keyring_gc_check_iterator(const void *object, void *iterator_data) { const struct key *key = keyring_ptr_to_key(object); time64_t *limit = iterator_data; key_check(key); return key_is_dead(key, *limit); } /* * Garbage collect pointers from a keyring. * * Not called with any locks held. The keyring's key struct will not be * deallocated under us as only our caller may deallocate it. */ void keyring_gc(struct key *keyring, time64_t limit) { int result; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); if (keyring->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED))) goto dont_gc; /* scan the keyring looking for dead keys */ rcu_read_lock(); result = assoc_array_iterate(&keyring->keys, keyring_gc_check_iterator, &limit); rcu_read_unlock(); if (result == true) goto do_gc; dont_gc: kleave(" [no gc]"); return; do_gc: down_write(&keyring->sem); assoc_array_gc(&keyring->keys, &keyring_assoc_array_ops, keyring_gc_select_iterator, &limit); up_write(&keyring->sem); kleave(" [gc]"); } /* * Garbage collect restriction pointers from a keyring. * * Keyring restrictions are associated with a key type, and must be cleaned * up if the key type is unregistered. The restriction is altered to always * reject additional keys so a keyring cannot be opened up by unregistering * a key type. * * Not called with any keyring locks held. The keyring's key struct will not * be deallocated under us as only our caller may deallocate it. * * The caller is required to hold key_types_sem and dead_type->sem. This is * fulfilled by key_gc_keytype() holding the locks on behalf of * key_garbage_collector(), which it invokes on a workqueue. */ void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type) { struct key_restriction *keyres; kenter("%x{%s}", keyring->serial, keyring->description ?: ""); /* * keyring->restrict_link is only assigned at key allocation time * or with the key type locked, so the only values that could be * concurrently assigned to keyring->restrict_link are for key * types other than dead_type. Given this, it's ok to check * the key type before acquiring keyring->sem. */ if (!dead_type || !keyring->restrict_link || keyring->restrict_link->keytype != dead_type) { kleave(" [no restriction gc]"); return; } /* Lock the keyring to ensure that a link is not in progress */ down_write(&keyring->sem); keyres = keyring->restrict_link; keyres->check = restrict_link_reject; key_put(keyres->key); keyres->key = NULL; keyres->keytype = NULL; up_write(&keyring->sem); kleave(" [restriction gc]"); }
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoenet.c * Ethernet portion of AoE driver */ #include <linux/gfp.h> #include <linux/hdreg.h> #include <linux/blkdev.h> #include <linux/netdevice.h> #include <linux/moduleparam.h> #include <net/net_namespace.h> #include <linux/unaligned.h> #include "aoe.h" #define NECODES 5 static char *aoe_errlist[] = { "no such error", "unrecognized command code", "bad argument parameter", "device unavailable", "config string present", "unsupported version" }; enum { IFLISTSZ = 1024, }; static char aoe_iflist[IFLISTSZ]; module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]"); static wait_queue_head_t txwq; static struct ktstate kts; #ifndef MODULE static int __init aoe_iflist_setup(char *str) { strscpy(aoe_iflist, str, IFLISTSZ); return 1; } __setup("aoe_iflist=", aoe_iflist_setup); #endif static spinlock_t txlock; static struct sk_buff_head skbtxq; /* enters with txlock held */ static int tx(int id) __must_hold(&txlock) { struct sk_buff *skb; struct net_device *ifp; while ((skb = skb_dequeue(&skbtxq))) { spin_unlock_irq(&txlock); ifp = skb->dev; if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit()) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); dev_put(ifp); spin_lock_irq(&txlock); } return 0; } int is_aoe_netif(struct net_device *ifp) { register char *p, *q; register int len; if (aoe_iflist[0] == '\0') return 1; p = aoe_iflist + strspn(aoe_iflist, WHITESPACE); for (; *p; p = q + strspn(q, WHITESPACE)) { q = p + strcspn(p, WHITESPACE); if (q != p) len = q - p; else len = strlen(p); /* last token in aoe_iflist */ if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len)) return 1; if (q == p) break; } return 0; } int set_aoe_iflist(const char __user *user_str, size_t size) { if (size >= IFLISTSZ) return -EINVAL; if (copy_from_user(aoe_iflist, user_str, size)) { printk(KERN_INFO "aoe: copy from user failed\n"); return -EFAULT; } aoe_iflist[size] = 0x00; return 0; } void aoenet_xmit(struct sk_buff_head *queue) { struct sk_buff *skb, *tmp; ulong flags; skb_queue_walk_safe(queue, skb, tmp) { __skb_unlink(skb, queue); spin_lock_irqsave(&txlock, flags); skb_queue_tail(&skbtxq, skb); spin_unlock_irqrestore(&txlock, flags); wake_up(&txwq); } } /* * (1) len doesn't include the header by default. I want this. */ static int aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; struct aoe_atahdr *ah; u32 n; int sn; if (dev_net(ifp) != &init_net) goto exit; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) return 0; if (!is_aoe_netif(ifp)) goto exit; skb_push(skb, ETH_HLEN); /* (1) */ sn = sizeof(*h) + sizeof(*ah); if (skb->len >= sn) { sn -= skb_headlen(skb); if (sn > 0 && !__pskb_pull_tail(skb, sn)) goto exit; } h = (struct aoe_hdr *) skb->data; n = get_unaligned_be32(&h->tag); if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) goto exit; if (h->verfl & AOEFL_ERR) { n = h->err; if (n > NECODES) n = 0; if (net_ratelimit()) printk(KERN_ERR "%s%d.%d@%s; ecode=%d '%s'\n", "aoe: error packet from ", get_unaligned_be16(&h->major), h->minor, skb->dev->name, h->err, aoe_errlist[n]); goto exit; } switch (h->cmd) { case AOECMD_ATA: /* ata_rsp may keep skb for later processing or give it back */ skb = aoecmd_ata_rsp(skb); break; case AOECMD_CFG: aoecmd_cfg_rsp(skb); break; default: if (h->cmd >= AOECMD_VEND_MIN) break; /* don't complain about vendor commands */ pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); break; } if (!skb) return 0; exit: dev_kfree_skb(skb); return 0; } static struct packet_type aoe_pt __read_mostly = { .type = __constant_htons(ETH_P_AOE), .func = aoenet_rcv, }; int __init aoenet_init(void) { skb_queue_head_init(&skbtxq); init_waitqueue_head(&txwq); spin_lock_init(&txlock); kts.lock = &txlock; kts.fn = tx; kts.waitq = &txwq; kts.id = 0; snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id); if (aoe_ktstart(&kts)) return -EAGAIN; dev_add_pack(&aoe_pt); return 0; } void aoenet_exit(void) { aoe_ktstop(&kts); skb_queue_purge(&skbtxq); dev_remove_pack(&aoe_pt); }
4539 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_COMPAT_H #define _LINUX_COMPAT_H /* * These are the type definitions for the architecture specific * syscall compatibility layer. */ #include <linux/types.h> #include <linux/time.h> #include <linux/stat.h> #include <linux/param.h> /* for HZ */ #include <linux/sem.h> #include <linux/socket.h> #include <linux/if.h> #include <linux/fs.h> #include <linux/aio_abi.h> /* for aio_context_t */ #include <linux/uaccess.h> #include <linux/unistd.h> #include <asm/compat.h> #include <asm/siginfo.h> #include <asm/signal.h> #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER /* * It may be useful for an architecture to override the definitions of the * COMPAT_SYSCALL_DEFINE0 and COMPAT_SYSCALL_DEFINEx() macros, in particular * to use a different calling convention for syscalls. To allow for that, + the prototypes for the compat_sys_*() functions below will *not* be included * if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. */ #include <asm/syscall_wrapper.h> #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ #ifndef COMPAT_USE_64BIT_TIME #define COMPAT_USE_64BIT_TIME 0 #endif #ifndef __SC_DELOUSE #define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v)) #endif #ifndef COMPAT_SYSCALL_DEFINE0 #define COMPAT_SYSCALL_DEFINE0(name) \ asmlinkage long compat_sys_##name(void); \ ALLOW_ERROR_INJECTION(compat_sys_##name, ERRNO); \ asmlinkage long compat_sys_##name(void) #endif /* COMPAT_SYSCALL_DEFINE0 */ #define COMPAT_SYSCALL_DEFINE1(name, ...) \ COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE2(name, ...) \ COMPAT_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE3(name, ...) \ COMPAT_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE4(name, ...) \ COMPAT_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE5(name, ...) \ COMPAT_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE6(name, ...) \ COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) /* * The asmlinkage stub is aliased to a function named __se_compat_sys_*() which * sign-extends 32-bit ints to longs whenever needed. The actual work is * done within __do_compat_sys_*(). */ #ifndef COMPAT_SYSCALL_DEFINEx #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ __MAP(x,__SC_TEST,__VA_ARGS__); \ return ret; \ } \ __diag_pop(); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* COMPAT_SYSCALL_DEFINEx */ struct compat_iovec { compat_uptr_t iov_base; compat_size_t iov_len; }; #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() #endif #ifndef compat_sigaltstack /* we'll need that for MIPS */ typedef struct compat_sigaltstack { compat_uptr_t ss_sp; int ss_flags; compat_size_t ss_size; } compat_stack_t; #endif #ifndef COMPAT_MINSIGSTKSZ #define COMPAT_MINSIGSTKSZ MINSIGSTKSZ #endif #define compat_jiffies_to_clock_t(x) \ (((unsigned long)(x) * COMPAT_USER_HZ) / HZ) typedef __compat_uid32_t compat_uid_t; typedef __compat_gid32_t compat_gid_t; struct compat_sel_arg_struct; struct rusage; struct old_itimerval32; struct compat_tms { compat_clock_t tms_utime; compat_clock_t tms_stime; compat_clock_t tms_cutime; compat_clock_t tms_cstime; }; #define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize); struct compat_sigaction { #ifndef __ARCH_HAS_IRIX_SIGACTION compat_uptr_t sa_handler; compat_ulong_t sa_flags; #else compat_uint_t sa_flags; compat_uptr_t sa_handler; #endif #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t sa_restorer; #endif compat_sigset_t sa_mask __packed; }; typedef union compat_sigval { compat_int_t sival_int; compat_uptr_t sival_ptr; } compat_sigval_t; typedef struct compat_siginfo { int si_signo; #ifndef __ARCH_HAS_SWAPPED_SIGINFO int si_errno; int si_code; #else int si_code; int si_errno; #endif union { int _pad[128/sizeof(int) - 3]; /* kill() */ struct { compat_pid_t _pid; /* sender's pid */ __compat_uid32_t _uid; /* sender's uid */ } _kill; /* POSIX.1b timers */ struct { compat_timer_t _tid; /* timer id */ int _overrun; /* overrun count */ compat_sigval_t _sigval; /* same as below */ } _timer; /* POSIX.1b signals */ struct { compat_pid_t _pid; /* sender's pid */ __compat_uid32_t _uid; /* sender's uid */ compat_sigval_t _sigval; } _rt; /* SIGCHLD */ struct { compat_pid_t _pid; /* which child */ __compat_uid32_t _uid; /* sender's uid */ int _status; /* exit code */ compat_clock_t _utime; compat_clock_t _stime; } _sigchld; #ifdef CONFIG_X86_X32_ABI /* SIGCHLD (x32 version) */ struct { compat_pid_t _pid; /* which child */ __compat_uid32_t _uid; /* sender's uid */ int _status; /* exit code */ compat_s64 _utime; compat_s64 _stime; } _sigchld_x32; #endif /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */ struct { compat_uptr_t _addr; /* faulting insn/memory ref. */ #define __COMPAT_ADDR_BND_PKEY_PAD (__alignof__(compat_uptr_t) < sizeof(short) ? \ sizeof(short) : __alignof__(compat_uptr_t)) union { /* used on alpha and sparc */ int _trapno; /* TRAP # which caused the signal */ /* * used when si_code=BUS_MCEERR_AR or * used when si_code=BUS_MCEERR_AO */ short int _addr_lsb; /* Valid LSB of the reported address. */ /* used when si_code=SEGV_BNDERR */ struct { char _dummy_bnd[__COMPAT_ADDR_BND_PKEY_PAD]; compat_uptr_t _lower; compat_uptr_t _upper; } _addr_bnd; /* used when si_code=SEGV_PKUERR */ struct { char _dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD]; u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ struct { compat_ulong_t _data; u32 _type; u32 _flags; } _perf; }; } _sigfault; /* SIGPOLL */ struct { compat_long_t _band; /* POLL_IN, POLL_OUT, POLL_MSG */ int _fd; } _sigpoll; struct { compat_uptr_t _call_addr; /* calling user insn */ int _syscall; /* triggering system call number */ unsigned int _arch; /* AUDIT_ARCH_* of syscall */ } _sigsys; } _sifields; } compat_siginfo_t; struct compat_rlimit { compat_ulong_t rlim_cur; compat_ulong_t rlim_max; }; #ifdef __ARCH_NEED_COMPAT_FLOCK64_PACKED #define __ARCH_COMPAT_FLOCK64_PACK __attribute__((packed)) #else #define __ARCH_COMPAT_FLOCK64_PACK #endif struct compat_flock { short l_type; short l_whence; compat_off_t l_start; compat_off_t l_len; #ifdef __ARCH_COMPAT_FLOCK_EXTRA_SYSID __ARCH_COMPAT_FLOCK_EXTRA_SYSID #endif compat_pid_t l_pid; #ifdef __ARCH_COMPAT_FLOCK_PAD __ARCH_COMPAT_FLOCK_PAD #endif }; struct compat_flock64 { short l_type; short l_whence; compat_loff_t l_start; compat_loff_t l_len; compat_pid_t l_pid; #ifdef __ARCH_COMPAT_FLOCK64_PAD __ARCH_COMPAT_FLOCK64_PAD #endif } __ARCH_COMPAT_FLOCK64_PACK; struct compat_rusage { struct old_timeval32 ru_utime; struct old_timeval32 ru_stime; compat_long_t ru_maxrss; compat_long_t ru_ixrss; compat_long_t ru_idrss; compat_long_t ru_isrss; compat_long_t ru_minflt; compat_long_t ru_majflt; compat_long_t ru_nswap; compat_long_t ru_inblock; compat_long_t ru_oublock; compat_long_t ru_msgsnd; compat_long_t ru_msgrcv; compat_long_t ru_nsignals; compat_long_t ru_nvcsw; compat_long_t ru_nivcsw; }; extern int put_compat_rusage(const struct rusage *, struct compat_rusage __user *); struct compat_siginfo; struct __compat_aio_sigset; struct compat_dirent { u32 d_ino; compat_off_t d_off; u16 d_reclen; char d_name[256]; }; struct compat_ustat { compat_daddr_t f_tfree; compat_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; #define COMPAT_SIGEV_PAD_SIZE ((SIGEV_MAX_SIZE/sizeof(int)) - 3) typedef struct compat_sigevent { compat_sigval_t sigev_value; compat_int_t sigev_signo; compat_int_t sigev_notify; union { compat_int_t _pad[COMPAT_SIGEV_PAD_SIZE]; compat_int_t _tid; struct { compat_uptr_t _function; compat_uptr_t _attribute; } _sigev_thread; } _sigev_un; } compat_sigevent_t; struct compat_ifmap { compat_ulong_t mem_start; compat_ulong_t mem_end; unsigned short base_addr; unsigned char irq; unsigned char dma; unsigned char port; }; struct compat_if_settings { unsigned int type; /* Type of physical device or protocol */ unsigned int size; /* Size of the data allocated by the caller */ compat_uptr_t ifs_ifsu; /* union of pointers */ }; struct compat_ifreq { union { char ifrn_name[IFNAMSIZ]; /* if name, e.g. "en0" */ } ifr_ifrn; union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct sockaddr ifru_netmask; struct sockaddr ifru_hwaddr; short ifru_flags; compat_int_t ifru_ivalue; compat_int_t ifru_mtu; struct compat_ifmap ifru_map; char ifru_slave[IFNAMSIZ]; /* Just fits the size */ char ifru_newname[IFNAMSIZ]; compat_caddr_t ifru_data; struct compat_if_settings ifru_settings; } ifr_ifru; }; struct compat_ifconf { compat_int_t ifc_len; /* size of buffer */ compat_caddr_t ifcbuf; }; struct compat_robust_list { compat_uptr_t next; }; struct compat_robust_list_head { struct compat_robust_list list; compat_long_t futex_offset; compat_uptr_t list_op_pending; }; #ifdef CONFIG_COMPAT_OLD_SIGACTION struct compat_old_sigaction { compat_uptr_t sa_handler; compat_old_sigset_t sa_mask; compat_ulong_t sa_flags; compat_uptr_t sa_restorer; }; #endif struct compat_keyctl_kdf_params { compat_uptr_t hashname; compat_uptr_t otherinfo; __u32 otherinfolen; __u32 __spare[8]; }; struct compat_stat; struct compat_statfs; struct compat_statfs64; struct compat_old_linux_dirent; struct compat_linux_dirent; struct linux_dirent64; struct compat_msghdr; struct compat_mmsghdr; struct compat_sysinfo; struct compat_sysctl_args; struct compat_kexec_segment; struct compat_mq_attr; struct compat_msgbuf; void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from); int copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo __user *from); int __copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); #ifndef copy_siginfo_to_user32 #define copy_siginfo_to_user32 __copy_siginfo_to_user32 #endif int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); /* * Defined inline such that size can be compile time constant, which avoids * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct */ static inline int put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, unsigned int size) { /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ #if defined(__BIG_ENDIAN) && defined(CONFIG_64BIT) compat_sigset_t v; switch (_NSIG_WORDS) { case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; fallthrough; case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; fallthrough; case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; fallthrough; case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; } return copy_to_user(compat, &v, size) ? -EFAULT : 0; #else return copy_to_user(compat, set, size) ? -EFAULT : 0; #endif } #ifdef CONFIG_CPU_BIG_ENDIAN #define unsafe_put_compat_sigset(compat, set, label) do { \ compat_sigset_t __user *__c = compat; \ const sigset_t *__s = set; \ \ switch (_NSIG_WORDS) { \ case 4: \ unsafe_put_user(__s->sig[3] >> 32, &__c->sig[7], label); \ unsafe_put_user(__s->sig[3], &__c->sig[6], label); \ fallthrough; \ case 3: \ unsafe_put_user(__s->sig[2] >> 32, &__c->sig[5], label); \ unsafe_put_user(__s->sig[2], &__c->sig[4], label); \ fallthrough; \ case 2: \ unsafe_put_user(__s->sig[1] >> 32, &__c->sig[3], label); \ unsafe_put_user(__s->sig[1], &__c->sig[2], label); \ fallthrough; \ case 1: \ unsafe_put_user(__s->sig[0] >> 32, &__c->sig[1], label); \ unsafe_put_user(__s->sig[0], &__c->sig[0], label); \ } \ } while (0) #define unsafe_get_compat_sigset(set, compat, label) do { \ const compat_sigset_t __user *__c = compat; \ compat_sigset_word hi, lo; \ sigset_t *__s = set; \ \ switch (_NSIG_WORDS) { \ case 4: \ unsafe_get_user(lo, &__c->sig[7], label); \ unsafe_get_user(hi, &__c->sig[6], label); \ __s->sig[3] = hi | (((long)lo) << 32); \ fallthrough; \ case 3: \ unsafe_get_user(lo, &__c->sig[5], label); \ unsafe_get_user(hi, &__c->sig[4], label); \ __s->sig[2] = hi | (((long)lo) << 32); \ fallthrough; \ case 2: \ unsafe_get_user(lo, &__c->sig[3], label); \ unsafe_get_user(hi, &__c->sig[2], label); \ __s->sig[1] = hi | (((long)lo) << 32); \ fallthrough; \ case 1: \ unsafe_get_user(lo, &__c->sig[1], label); \ unsafe_get_user(hi, &__c->sig[0], label); \ __s->sig[0] = hi | (((long)lo) << 32); \ } \ } while (0) #else #define unsafe_put_compat_sigset(compat, set, label) do { \ compat_sigset_t __user *__c = compat; \ const sigset_t *__s = set; \ \ unsafe_copy_to_user(__c, __s, sizeof(*__c), label); \ } while (0) #define unsafe_get_compat_sigset(set, compat, label) do { \ const compat_sigset_t __user *__c = compat; \ sigset_t *__s = set; \ \ unsafe_copy_from_user(__s, __c, sizeof(*__c), label); \ } while (0) #endif extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); struct epoll_event; /* fortunately, this one is fixed-layout */ int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define unsafe_compat_save_altstack(uss, sp, label) do { \ compat_stack_t __user *__uss = uss; \ struct task_struct *t = current; \ unsafe_put_user(ptr_to_compat((void __user *)t->sas_ss_sp), \ &__uss->ss_sp, label); \ unsafe_put_user(t->sas_ss_flags, &__uss->ss_flags, label); \ unsafe_put_user(t->sas_ss_size, &__uss->ss_size, label); \ } while (0); /* * These syscall function prototypes are kept in the same order as * include/uapi/asm-generic/unistd.h. Deprecated or obsolete system calls * go below. * * Please note that these prototypes here are only provided for information * purposes, for static analysis, and for linking from the syscall table. * These functions should not be called elsewhere from kernel code. * * As the syscall calling convention may be different from the default * for architectures overriding the syscall calling convention, do not * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. */ #ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr, u32 __user *iocb); asmlinkage long compat_sys_io_pgetevents(compat_aio_context_t ctx_id, compat_long_t min_nr, compat_long_t nr, struct io_event __user *events, struct old_timespec32 __user *timeout, const struct __compat_aio_sigset __user *usig); asmlinkage long compat_sys_io_pgetevents_time64(compat_aio_context_t ctx_id, compat_long_t min_nr, compat_long_t nr, struct io_event __user *events, struct __kernel_timespec __user *timeout, const struct __compat_aio_sigset __user *usig); asmlinkage long compat_sys_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, int timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_epoll_pwait2(int epfd, struct epoll_event __user *events, int maxevents, const struct __kernel_timespec __user *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, compat_ulong_t arg); asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, compat_ulong_t arg); asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, compat_ulong_t arg); asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf); asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf); asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_truncate(const char __user *, compat_off_t); asmlinkage long compat_sys_ftruncate(unsigned int, compat_off_t); /* No generic prototype for truncate64, ftruncate64, fallocate */ asmlinkage long compat_sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); asmlinkage long compat_sys_getdents(unsigned int fd, struct compat_linux_dirent __user *dirent, unsigned int count); asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); /* No generic prototype for pread64 and pwrite64 */ asmlinkage ssize_t compat_sys_preadv(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high); asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high); #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 asmlinkage long compat_sys_preadv64(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos); #endif #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 asmlinkage long compat_sys_pwritev64(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos); #endif asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, compat_size_t count); asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, compat_size_t count); asmlinkage long compat_sys_pselect6_time32(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timespec32 __user *tsp, void __user *sig); asmlinkage long compat_sys_pselect6_time64(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct __kernel_timespec __user *tsp, void __user *sig); asmlinkage long compat_sys_ppoll_time32(struct pollfd __user *ufds, unsigned int nfds, struct old_timespec32 __user *tsp, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_ppoll_time64(struct pollfd __user *ufds, unsigned int nfds, struct __kernel_timespec __user *tsp, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); asmlinkage long compat_sys_signalfd4(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize, int flags); asmlinkage long compat_sys_newfstatat(unsigned int dfd, const char __user *filename, struct compat_stat __user *statbuf, int flag); asmlinkage long compat_sys_newfstat(unsigned int fd, struct compat_stat __user *statbuf); /* No generic prototype for sync_file_range and sync_file_range2 */ asmlinkage long compat_sys_waitid(int, compat_pid_t, struct compat_siginfo __user *, int, struct compat_rusage __user *); asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len); asmlinkage long compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr); asmlinkage long compat_sys_getitimer(int which, struct old_itimerval32 __user *it); asmlinkage long compat_sys_setitimer(int which, struct old_itimerval32 __user *in, struct old_itimerval32 __user *out); asmlinkage long compat_sys_kexec_load(compat_ulong_t entry, compat_ulong_t nr_segments, struct compat_kexec_segment __user *, compat_ulong_t flags); asmlinkage long compat_sys_timer_create(clockid_t which_clock, struct compat_sigevent __user *timer_event_spec, timer_t __user *created_timer_id); asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr); asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, compat_ulong_t __user *user_mask_ptr); asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr); asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize); #ifndef CONFIG_ODD_RT_SIGACTION asmlinkage long compat_sys_rt_sigaction(int, const struct compat_sigaction __user *, struct compat_sigaction __user *, compat_size_t); #endif asmlinkage long compat_sys_rt_sigprocmask(int how, compat_sigset_t __user *set, compat_sigset_t __user *oset, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigpending(compat_sigset_t __user *uset, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigtimedwait_time32(compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, struct old_timespec32 __user *uts, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigtimedwait_time64(compat_sigset_t __user *uthese, struct compat_siginfo __user *uinfo, struct __kernel_timespec __user *uts, compat_size_t sigsetsize); asmlinkage long compat_sys_rt_sigqueueinfo(compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); /* No generic prototype for rt_sigreturn */ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf); asmlinkage long compat_sys_getrlimit(unsigned int resource, struct compat_rlimit __user *rlim); asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim); asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru); asmlinkage long compat_sys_gettimeofday(struct old_timeval32 __user *tv, struct timezone __user *tz); asmlinkage long compat_sys_settimeofday(struct old_timeval32 __user *tv, struct timezone __user *tz); asmlinkage long compat_sys_sysinfo(struct compat_sysinfo __user *info); asmlinkage long compat_sys_mq_open(const char __user *u_name, int oflag, compat_mode_t mode, struct compat_mq_attr __user *u_attr); asmlinkage long compat_sys_mq_notify(mqd_t mqdes, const struct compat_sigevent __user *u_notification); asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, const struct compat_mq_attr __user *u_mqstat, struct compat_mq_attr __user *u_omqstat); asmlinkage long compat_sys_msgctl(int first, int second, void __user *uptr); asmlinkage long compat_sys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg); asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg); asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg); asmlinkage long compat_sys_shmctl(int first, int second, void __user *uptr); asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); asmlinkage long compat_sys_recvfrom(int fd, void __user *buf, compat_size_t len, unsigned flags, struct sockaddr __user *addr, int __user *addrlen); asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags); asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags); /* No generic prototype for readahead */ asmlinkage long compat_sys_keyctl(u32 option, u32 arg2, u32 arg3, u32 arg4, u32 arg5); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp); /* No generic prototype for fadvise64_64 */ /* CONFIG_MMU only */ asmlinkage long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, struct compat_siginfo __user *uinfo); asmlinkage long compat_sys_recvmmsg_time64(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, struct __kernel_timespec __user *timeout); asmlinkage long compat_sys_recvmmsg_time32(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags, struct old_timespec32 __user *timeout); asmlinkage long compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, struct compat_rusage __user *ru); asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); asmlinkage long compat_sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned vlen, unsigned int flags); asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp, int flags); asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags); asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd, const struct iovec __user *vec, compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags); #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 asmlinkage long compat_sys_preadv64v2(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags); #endif #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 asmlinkage long compat_sys_pwritev64v2(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags); #endif /* * Deprecated system calls which are still defined in * include/uapi/asm-generic/unistd.h and wanted by >= 1 arch */ /* __ARCH_WANT_SYSCALL_NO_AT */ asmlinkage long compat_sys_open(const char __user *filename, int flags, umode_t mode); /* __ARCH_WANT_SYSCALL_NO_FLAGS */ asmlinkage long compat_sys_signalfd(int ufd, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); /* __ARCH_WANT_SYSCALL_OFF_T */ asmlinkage long compat_sys_newstat(const char __user *filename, struct compat_stat __user *statbuf); asmlinkage long compat_sys_newlstat(const char __user *filename, struct compat_stat __user *statbuf); /* __ARCH_WANT_SYSCALL_DEPRECATED */ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp); asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u32); asmlinkage long compat_sys_recv(int fd, void __user *buf, compat_size_t len, unsigned flags); /* obsolete */ asmlinkage long compat_sys_old_readdir(unsigned int fd, struct compat_old_linux_dirent __user *, unsigned int count); /* obsolete */ asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg); /* obsolete */ asmlinkage long compat_sys_ipc(u32, int, int, u32, compat_uptr_t, u32); /* obsolete */ #ifdef __ARCH_WANT_SYS_SIGPENDING asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set); #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *nset, compat_old_sigset_t __user *oset); #endif #ifdef CONFIG_COMPAT_OLD_SIGACTION asmlinkage long compat_sys_sigaction(int sig, const struct compat_old_sigaction __user *act, struct compat_old_sigaction __user *oact); #endif /* obsolete */ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); #ifdef __ARCH_WANT_COMPAT_TRUNCATE64 asmlinkage long compat_sys_truncate64(const char __user *pathname, compat_arg_u64(len)); #endif #ifdef __ARCH_WANT_COMPAT_FTRUNCATE64 asmlinkage long compat_sys_ftruncate64(unsigned int fd, compat_arg_u64(len)); #endif #ifdef __ARCH_WANT_COMPAT_FALLOCATE asmlinkage long compat_sys_fallocate(int fd, int mode, compat_arg_u64(offset), compat_arg_u64(len)); #endif #ifdef __ARCH_WANT_COMPAT_PREAD64 asmlinkage long compat_sys_pread64(unsigned int fd, char __user *buf, size_t count, compat_arg_u64(pos)); #endif #ifdef __ARCH_WANT_COMPAT_PWRITE64 asmlinkage long compat_sys_pwrite64(unsigned int fd, const char __user *buf, size_t count, compat_arg_u64(pos)); #endif #ifdef __ARCH_WANT_COMPAT_SYNC_FILE_RANGE asmlinkage long compat_sys_sync_file_range(int fd, compat_arg_u64(pos), compat_arg_u64(nbytes), unsigned int flags); #endif #ifdef __ARCH_WANT_COMPAT_FADVISE64_64 asmlinkage long compat_sys_fadvise64_64(int fd, compat_arg_u64(pos), compat_arg_u64(len), int advice); #endif #ifdef __ARCH_WANT_COMPAT_READAHEAD asmlinkage long compat_sys_readahead(int fd, compat_arg_u64(offset), size_t count); #endif #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ /** * ns_to_old_timeval32 - Compat version of ns_to_timeval * @nsec: the nanoseconds value to be converted * * Returns the old_timeval32 representation of the nsec parameter. */ static inline struct old_timeval32 ns_to_old_timeval32(s64 nsec) { struct __kernel_old_timeval tv; struct old_timeval32 ctv; tv = ns_to_kernel_old_timeval(nsec); ctv.tv_sec = tv.tv_sec; ctv.tv_usec = tv.tv_usec; return ctv; } /* * Kernel code should not call compat syscalls (i.e., compat_sys_xyzyyz()) * directly. Instead, use one of the functions which work equivalently, such * as the kcompat_sys_xyzyyz() functions prototyped below. */ int kcompat_sys_statfs64(const char __user * pathname, compat_size_t sz, struct compat_statfs64 __user * buf); int kcompat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user * buf); #ifdef CONFIG_COMPAT /* * For most but not all architectures, "am I in a compat syscall?" and * "am I a compat task?" are the same question. For architectures on which * they aren't the same question, arch code can override in_compat_syscall. */ #ifndef in_compat_syscall static inline bool in_compat_syscall(void) { return is_compat_task(); } #endif #else /* !CONFIG_COMPAT */ #define is_compat_task() (0) /* Ensure no one redefines in_compat_syscall() under !CONFIG_COMPAT */ #define in_compat_syscall in_compat_syscall static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ #define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) #define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size); /* * Some legacy ABIs like the i386 one use less than natural alignment for 64-bit * types, and will need special compat treatment for that. Most architectures * don't need that special handling even for compat syscalls. */ #ifndef compat_need_64bit_alignment_fixup #define compat_need_64bit_alignment_fixup() false #endif /* * A pointer passed in from user mode. This should not * be used for syscall parameters, just declare them * as pointers because the syscall entry code will have * appropriately converted them already. */ #ifndef compat_ptr static inline void __user *compat_ptr(compat_uptr_t uptr) { return (void __user *)(unsigned long)uptr; } #endif static inline compat_uptr_t ptr_to_compat(void __user *uptr) { return (u32)(unsigned long)uptr; } #endif /* _LINUX_COMPAT_H */
2 2 2 1 6 6 8 8 8 2 2 2 2 2 2 17 6 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004, 2020 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. */ #ifndef IB_VERBS_H #define IB_VERBS_H #include <linux/ethtool.h> #include <linux/types.h> #include <linux/device.h> #include <linux/dma-mapping.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rwsem.h> #include <linux/workqueue.h> #include <linux/irq_poll.h> #include <uapi/linux/if_ether.h> #include <net/ipv6.h> #include <net/ip.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/refcount.h> #include <linux/if_link.h> #include <linux/atomic.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/cgroup_rdma.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/dim.h> #include <uapi/rdma/ib_user_verbs.h> #include <rdma/rdma_counter.h> #include <rdma/restrack.h> #include <rdma/signature.h> #include <uapi/rdma/rdma_user_ioctl.h> #include <uapi/rdma/ib_user_ioctl_verbs.h> #include <linux/pci-tph.h> #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN struct ib_umem_odp; struct ib_uqp_object; struct ib_usrq_object; struct ib_uwq_object; struct rdma_cm_id; struct ib_port; struct hw_stats_device_data; extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; extern struct workqueue_struct *ib_comp_unbound_wq; struct ib_ucq_object; __printf(2, 3) __cold void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_alert(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_crit(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_err(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_warn(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else __printf(2, 3) __cold static inline void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {} #endif #define ibdev_level_ratelimited(ibdev_level, ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ ibdev_level(ibdev, fmt, ##__VA_ARGS__); \ } while (0) #define ibdev_emerg_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_emerg, ibdev, fmt, ##__VA_ARGS__) #define ibdev_alert_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_alert, ibdev, fmt, ##__VA_ARGS__) #define ibdev_crit_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_crit, ibdev, fmt, ##__VA_ARGS__) #define ibdev_err_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_err, ibdev, fmt, ##__VA_ARGS__) #define ibdev_warn_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_warn, ibdev, fmt, ##__VA_ARGS__) #define ibdev_notice_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_notice, ibdev, fmt, ##__VA_ARGS__) #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && __ratelimit(&_rs)) \ __dynamic_ibdev_dbg(&descriptor, ibdev, fmt, \ ##__VA_ARGS__); \ } while (0) #else __printf(2, 3) __cold static inline void ibdev_dbg_ratelimited(const struct ib_device *ibdev, const char *format, ...) {} #endif union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; extern union ib_gid zgid; enum ib_gid_type { IB_GID_TYPE_IB = IB_UVERBS_GID_TYPE_IB, IB_GID_TYPE_ROCE = IB_UVERBS_GID_TYPE_ROCE_V1, IB_GID_TYPE_ROCE_UDP_ENCAP = IB_UVERBS_GID_TYPE_ROCE_V2, IB_GID_TYPE_SIZE }; #define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { struct net_device __rcu *ndev; struct ib_device *device; union ib_gid gid; enum ib_gid_type gid_type; u16 index; u32 port_num; }; enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, RDMA_TRANSPORT_USNIC, RDMA_TRANSPORT_USNIC_UDP, RDMA_TRANSPORT_UNSPECIFIED, }; enum rdma_protocol_type { RDMA_PROTOCOL_IB, RDMA_PROTOCOL_IBOE, RDMA_PROTOCOL_IWARP, RDMA_PROTOCOL_USNIC_UDP }; __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type); enum rdma_network_type { RDMA_NETWORK_IB, RDMA_NETWORK_ROCE_V1, RDMA_NETWORK_IPV4, RDMA_NETWORK_IPV6 }; static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) { if (network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) return IB_GID_TYPE_ROCE_UDP_ENCAP; else if (network_type == RDMA_NETWORK_ROCE_V1) return IB_GID_TYPE_ROCE; else return IB_GID_TYPE_IB; } static inline enum rdma_network_type rdma_gid_attr_network_type(const struct ib_gid_attr *attr) { if (attr->gid_type == IB_GID_TYPE_IB) return RDMA_NETWORK_IB; if (attr->gid_type == IB_GID_TYPE_ROCE) return RDMA_NETWORK_ROCE_V1; if (ipv6_addr_v4mapped((struct in6_addr *)&attr->gid)) return RDMA_NETWORK_IPV4; else return RDMA_NETWORK_IPV6; } enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, }; enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR, IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR, IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR, IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI, IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG, IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT, IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE, IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD, IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT, /* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */ IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT, IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID, IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN, IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE, IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ, /* Reserved, old SEND_W_INV = 1 << 16,*/ IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW, /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM, IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC, /* * This device supports the IB "base memory management extension", * which includes support for fast registrations (IB_WR_REG_MR, * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should * also be set by any iWarp device which must support FRs to comply * to the iWarp verbs spec. iWarp devices also support the * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the * stag. */ IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS, IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A, IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B, IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM, /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */ IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM, IB_DEVICE_MANAGED_FLOW_STEERING = IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING, /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */ IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS, /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, /* Placement type attributes */ IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL, IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT, IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; enum ib_kernel_cap_flags { /* * This device supports a per-device lkey or stag that can be * used without performing a memory registration for the local * memory. Note that ULPs should never check this flag, but * instead of use the local_dma_lkey flag in the ib_pd structure, * which will always contain a usable lkey. */ IBK_LOCAL_DMA_LKEY = 1 << 0, /* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */ IBK_INTEGRITY_HANDOVER = 1 << 1, /* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */ IBK_ON_DEMAND_PAGING = 1 << 2, /* IB_MR_TYPE_SG_GAPS is supported */ IBK_SG_GAPS_REG = 1 << 3, /* Driver supports RDMA_NLDEV_CMD_DELLINK */ IBK_ALLOW_USER_UNREG = 1 << 4, /* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */ IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5, /* iopib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */ IBK_UD_TSO = 1 << 6, /* iopib will use the device ops: * get_vf_config * get_vf_guid * get_vf_stats * set_vf_guid * set_vf_link_state */ IBK_VIRTUAL_FUNCTION = 1 << 7, /* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */ IBK_RDMA_NETDEV_OPA = 1 << 8, }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; enum ib_odp_general_cap_bits { IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT, IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT, }; enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND, IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV, IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE, IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ, IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC, IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV, IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH, IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE, }; struct ib_odp_caps { uint64_t general_caps; struct { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; uint32_t xrc_odp_caps; } per_transport_caps; }; struct ib_rss_caps { /* Corresponding bit will be set if qp type from * 'enum ib_qp_type' is supported, e.g. * supported_qpts |= 1 << IB_QPT_UD */ u32 supported_qpts; u32 max_rwq_indirection_tables; u32 max_rwq_indirection_table_size; }; enum ib_tm_cap_flags { /* Support tag matching with rendezvous offload for RC transport */ IB_TM_CAP_RNDV_RC = 1 << 0, }; struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ u32 max_num_tags; /* From enum ib_tm_cap_flags */ u32 flags; /* Max number of outstanding list operations */ u32 max_ops; /* Max number of SGE in tag matching entry */ u32 max_sge; }; struct ib_cq_init_attr { unsigned int cqe; u32 comp_vector; u32 flags; }; enum ib_cq_attr_mask { IB_CQ_MODERATE = 1 << 0, }; struct ib_cq_caps { u16 max_cq_moderation_count; u16 max_cq_moderation_period; }; struct ib_dm_mr_attr { u64 length; u64 offset; u32 access_flags; }; struct ib_dm_alloc_attr { u64 length; u32 alignment; u32 flags; }; struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; u64 device_cap_flags; u64 kernel_cap_flags; int max_send_sge; int max_recv_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; unsigned int max_pi_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; struct ib_odp_caps odp_caps; uint64_t timestamp_mask; uint64_t hca_core_clock; /* in KHZ */ struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ struct ib_tm_caps tm_caps; struct ib_cq_caps cq_caps; u64 max_dm_size; /* Max entries for sgl for optimized performance per READ */ u32 max_sgl_rd; }; enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; enum opa_mtu { OPA_MTU_8192 = 6, OPA_MTU_10240 = 7 }; static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } static inline enum ib_mtu ib_mtu_int_to_enum(int mtu) { if (mtu >= 4096) return IB_MTU_4096; else if (mtu >= 2048) return IB_MTU_2048; else if (mtu >= 1024) return IB_MTU_1024; else if (mtu >= 512) return IB_MTU_512; else return IB_MTU_256; } static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) { switch (mtu) { case OPA_MTU_8192: return 8192; case OPA_MTU_10240: return 10240; default: return(ib_mtu_enum_to_int((enum ib_mtu)mtu)); } } static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) { if (mtu >= 10240) return OPA_MTU_10240; else if (mtu >= 8192) return OPA_MTU_8192; else return ((enum opa_mtu)ib_mtu_int_to_enum(mtu)); } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5 }; static inline const char *__attribute_const__ ib_port_state_to_str(enum ib_port_state state) { const char * const states[] = { [IB_PORT_NOP] = "NOP", [IB_PORT_DOWN] = "DOWN", [IB_PORT_INIT] = "INIT", [IB_PORT_ARMED] = "ARMED", [IB_PORT_ACTIVE] = "ACTIVE", [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER", }; if (state < ARRAY_SIZE(states)) return states[state]; return "UNKNOWN"; } enum ib_port_phys_state { IB_PORT_PHYS_STATE_SLEEP = 1, IB_PORT_PHYS_STATE_POLLING = 2, IB_PORT_PHYS_STATE_DISABLED = 3, IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4, IB_PORT_PHYS_STATE_LINK_UP = 5, IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6, IB_PORT_PHYS_STATE_PHY_TEST = 7, }; enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_2X = 16, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_2X: return 2; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32, IB_SPEED_HDR = 64, IB_SPEED_NDR = 128, IB_SPEED_XDR = 256, }; enum ib_stat_flag { IB_STAT_FLAG_OPTIONAL = 1 << 0, }; /** * struct rdma_stat_desc - description of one rdma stat/counter * @name: The name of the counter * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL * @priv: Driver private information; Core code should not use */ struct rdma_stat_desc { const char *name; unsigned int flags; const void *priv; }; /** * struct rdma_hw_stats - collection of hardware stats and their management * @lock: Mutex to protect parallel write access to lifespan and values * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. * @timestamp: Used by the core code to track when the last update was * @lifespan: Used by the core code to determine how old the counters * should be before being updated again. Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. * @descs: Array of pointers to static descriptors used for the counters * in directory. * @is_disabled: A bitmap to indicate each counter is currently disabled * or not. * @num_counters: How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. * @value: Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { struct mutex lock; /* Protect lifespan and values[] */ unsigned long timestamp; unsigned long lifespan; const struct rdma_stat_desc *descs; unsigned long *is_disabled; int num_counters; u64 value[] __counted_by(num_counters); }; #define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const struct rdma_stat_desc *descs, int num_counters, unsigned long lifespan); void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats); /* Define bits for the various functionality this port needs to be supported by * the core. */ /* Management 0x00000FFF */ #define RDMA_CORE_CAP_IB_MAD 0x00000001 #define RDMA_CORE_CAP_IB_SMI 0x00000002 #define RDMA_CORE_CAP_IB_CM 0x00000004 #define RDMA_CORE_CAP_IW_CM 0x00000008 #define RDMA_CORE_CAP_IB_SA 0x00000010 #define RDMA_CORE_CAP_OPA_MAD 0x00000020 /* Address format 0x000FF000 */ #define RDMA_CORE_CAP_AF_IB 0x00001000 #define RDMA_CORE_CAP_ETH_AH 0x00002000 #define RDMA_CORE_CAP_OPA_AH 0x00004000 #define RDMA_CORE_CAP_IB_GRH_REQUIRED 0x00008000 /* Protocol 0xFFF00000 */ #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 #define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 #define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000 #define RDMA_CORE_CAP_PROT_USNIC 0x02000000 #define RDMA_CORE_PORT_IB_GRH_REQUIRED (RDMA_CORE_CAP_IB_GRH_REQUIRED \ | RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP) #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_SMI \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_IB_SA \ | RDMA_CORE_CAP_AF_IB) #define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ | RDMA_CORE_CAP_OPA_MAD) #define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET) #define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC) struct ib_port_attr { u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; u32 phys_mtu; int gid_tbl_len; unsigned int ip_gids:1; /* This is the value from PortInfo CapabilityMask, defined by IBA */ u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u32 sm_lid; u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u16 active_speed; u8 phys_state; u16 port_cap_flags2; }; enum ib_device_modify_flags { IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; #define IB_DEVICE_NODE_DESC_MAX 64 struct ib_device_modify { u64 sys_image_guid; char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3), IB_PORT_OPA_MASK_CHG = (1<<4) }; struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; struct ib_wq *wq; u32 port_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { const struct ib_gid_attr *sgid_attr; union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; union rdma_network_hdr { struct ib_grh ibgrh; struct { /* The IB spec states that if it's IPv4, the header * is located in the last 20 bytes of the header. */ u8 reserved[20]; struct iphdr roce4grh; }; }; #define IB_QPN_MASK 0xFFFFFF enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) #define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 }; enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18, IB_RATE_28_GBPS = 19, IB_RATE_50_GBPS = 20, IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, IB_RATE_800_GBPS = 23, IB_RATE_1600_GBPS = 25, }; /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); /** * enum ib_mr_type - memory region type * @IB_MR_TYPE_MEM_REG: memory region that is used for * normal registration * @IB_MR_TYPE_SG_GAPS: memory region that is capable to * register any arbitrary sg lists (without * the normal mr constraints - see * ib_map_mr_sg) * @IB_MR_TYPE_DM: memory region that is used for device * memory registration * @IB_MR_TYPE_USER: memory region that is used for the user-space * application * @IB_MR_TYPE_DMA: memory region that is used for DMA operations * without address translations (VA=PA) * @IB_MR_TYPE_INTEGRITY: memory region that is used for * data integrity operations */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SG_GAPS, IB_MR_TYPE_DM, IB_MR_TYPE_USER, IB_MR_TYPE_DMA, IB_MR_TYPE_INTEGRITY, }; enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; /** * struct ib_mr_status - Memory region status container * * @fail_status: Bitmask of MR checks status. For each * failed check a corresponding status bit is set. * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS * failure. */ struct ib_mr_status { u32 fail_status; struct ib_sig_err sig_err; }; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert. */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct rdma_ah_init_attr { struct rdma_ah_attr *ah_attr; u32 flags; struct net_device *xmit_slave; }; enum rdma_ah_attr_type { RDMA_AH_ATTR_TYPE_UNDEFINED, RDMA_AH_ATTR_TYPE_IB, RDMA_AH_ATTR_TYPE_ROCE, RDMA_AH_ATTR_TYPE_OPA, }; struct ib_ah_attr { u16 dlid; u8 src_path_bits; }; struct roce_ah_attr { u8 dmac[ETH_ALEN]; }; struct opa_ah_attr { u32 dlid; u8 src_path_bits; bool make_grd; }; struct rdma_ah_attr { struct ib_global_route grh; u8 sl; u8 static_rate; u32 port_num; u8 ah_flags; enum rdma_ah_attr_type type; union { struct ib_ah_attr ib; struct roce_ah_attr roce; struct opa_ah_attr opa; }; }; enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); enum ib_wc_opcode { IB_WC_SEND = IB_UVERBS_WC_SEND, IB_WC_RDMA_WRITE = IB_UVERBS_WC_RDMA_WRITE, IB_WC_RDMA_READ = IB_UVERBS_WC_RDMA_READ, IB_WC_COMP_SWAP = IB_UVERBS_WC_COMP_SWAP, IB_WC_FETCH_ADD = IB_UVERBS_WC_FETCH_ADD, IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW, IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV, IB_WC_LSO = IB_UVERBS_WC_TSO, IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, IB_WC_FLUSH = IB_UVERBS_WC_FLUSH, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). */ IB_WC_RECV = 1 << 7, IB_WC_RECV_RDMA_WITH_IMM }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { union { u64 wr_id; struct ib_cqe *wr_cqe; }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; u32 slid; int wc_flags; u16 pkey_index; u8 sl; u8 dlid_path_bits; u32 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; u8 network_hdr_type; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC, IB_SRQT_XRC = IB_UVERBS_SRQT_XRC, IB_SRQT_TM = IB_UVERBS_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) { return srq_type == IB_SRQT_XRC || srq_type == IB_SRQT_TM; } enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; } xrc; struct { u32 max_num_tags; } tag_matching; }; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; /* * Maximum number of rdma_rw_ctx structures in flight at a time. * ib_create_qp() will calculate the right amount of needed WRs * and MRs based on this. */ u32 max_rdma_ctxs; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC = IB_UVERBS_QPT_RC, IB_QPT_UC = IB_UVERBS_QPT_UC, IB_QPT_UD = IB_UVERBS_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET, IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI, IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT, IB_QPT_MAX, IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer */ IB_QPT_RESERVED1 = 0x1000, IB_QPT_RESERVED2, IB_QPT_RESERVED3, IB_QPT_RESERVED4, IB_QPT_RESERVED5, IB_QPT_RESERVED6, IB_QPT_RESERVED7, IB_QPT_RESERVED8, IB_QPT_RESERVED9, IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_INTEGRITY_EN = 1 << 6, IB_QP_CREATE_NETDEV_USE = 1 << 7, IB_QP_CREATE_SCATTER_FCS = IB_UVERBS_QP_CREATE_SCATTER_FCS, IB_QP_CREATE_CVLAN_STRIPPING = IB_UVERBS_QP_CREATE_CVLAN_STRIPPING, IB_QP_CREATE_SOURCE_QPN = 1 << 10, IB_QP_CREATE_PCI_WRITE_END_PADDING = IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; /* * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler * callback to destroy the passed in QP. */ struct ib_qp_init_attr { /* This callback occurs in workqueue context */ void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; u32 create_flags; /* * Only needed for special QP types, or when using the RW API. */ u32 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; u32 source_qpn; }; struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), IB_QP_RESERVED1 = (1<<21), IB_QP_RESERVED2 = (1<<22), IB_QP_RESERVED3 = (1<<23), IB_QP_RESERVED4 = (1<<24), IB_QP_RATE_LIMIT = (1<<25), IB_QP_ATTR_STANDARD_BITS = GENMASK(20, 0), }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; enum ib_mw_type { IB_MW_TYPE_1 = 1, IB_MW_TYPE_2 = 2 }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct rdma_ah_attr ah_attr; struct rdma_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u32 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u32 alt_port_num; u8 alt_timeout; u32 rate_limit; struct net_device *xmit_slave; }; enum ib_wr_opcode { /* These are shared with userspace */ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND = IB_UVERBS_WR_SEND, IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, IB_WR_BIND_MW = IB_UVERBS_WR_BIND_MW, IB_WR_LSO = IB_UVERBS_WR_TSO, IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, IB_WR_MASKED_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, IB_WR_FLUSH = IB_UVERBS_WR_FLUSH, IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, IB_WR_REG_MR_INTEGRITY, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ IB_WR_RESERVED1 = 0xf0, IB_WR_RESERVED2, IB_WR_RESERVED3, IB_WR_RESERVED4, IB_WR_RESERVED5, IB_WR_RESERVED6, IB_WR_RESERVED7, IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; struct ib_cqe { void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; }; struct ib_rdma_wr { struct ib_send_wr wr; u64 remote_addr; u32 rkey; }; static inline const struct ib_rdma_wr *rdma_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_rdma_wr, wr); } struct ib_atomic_wr { struct ib_send_wr wr; u64 remote_addr; u64 compare_add; u64 swap; u64 compare_add_mask; u64 swap_mask; u32 rkey; }; static inline const struct ib_atomic_wr *atomic_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_atomic_wr, wr); } struct ib_ud_wr { struct ib_send_wr wr; struct ib_ah *ah; void *header; int hlen; int mss; u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ u32 port_num; /* valid for DR SMPs on switch only */ }; static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_ud_wr, wr); } struct ib_reg_wr { struct ib_send_wr wr; struct ib_mr *mr; u32 key; int access; }; static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_reg_wr, wr); } struct ib_recv_wr { struct ib_recv_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = IB_UVERBS_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_WRITE = IB_UVERBS_ACCESS_REMOTE_WRITE, IB_ACCESS_REMOTE_READ = IB_UVERBS_ACCESS_REMOTE_READ, IB_ACCESS_REMOTE_ATOMIC = IB_UVERBS_ACCESS_REMOTE_ATOMIC, IB_ACCESS_MW_BIND = IB_UVERBS_ACCESS_MW_BIND, IB_ZERO_BASED = IB_UVERBS_ACCESS_ZERO_BASED, IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND, IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB, IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING, IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL, IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT, IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE, IB_ACCESS_SUPPORTED = ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL, }; /* * XXX: these are apparently used for ->rereg_user_mr, no idea why they * are hidden here instead of a uapi header! */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), IB_MR_REREG_ACCESS = (1<<2), IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; struct ib_umem; enum rdma_remove_reason { /* * Userspace requested uobject deletion or initial try * to remove uobject via cleanup. Call could fail */ RDMA_REMOVE_DESTROY, /* Context deletion. This call should delete the actual object itself */ RDMA_REMOVE_CLOSE, /* Driver is being hot-unplugged. This call should delete the actual object itself */ RDMA_REMOVE_DRIVER_REMOVE, /* uobj is being cleaned-up before being committed */ RDMA_REMOVE_ABORT, /* The driver failed to destroy the uobject and is being disconnected */ RDMA_REMOVE_DRIVER_FAILURE, }; struct ib_rdmacg_object { #ifdef CONFIG_CGROUP_RDMA struct rdma_cgroup *cg; /* owner rdma cgroup */ #endif }; struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; struct ib_rdmacg_object cg_obj; u64 enabled_caps; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; struct xarray mmap_xa; }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ /* ufile & ucontext owning this object */ struct ib_uverbs_file *ufile; /* FIXME, save memory: ufile->context == context */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ struct ib_rdmacg_object cg_obj; /* rdmacg object */ int id; /* index into kernel idr */ struct kref ref; atomic_t usecnt; /* protects exclusive access */ struct rcu_head rcu; /* kfree_rcu() overhead */ const struct uverbs_api_object *uapi_object; }; struct ib_udata { const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { u32 local_dma_lkey; u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ u32 unsafe_global_rkey; /* * Implementation details of the RDMA core, don't use in drivers: */ struct ib_mr *__internal_mr; struct rdma_restrack_entry res; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct rw_semaphore tgt_qps_rwsem; struct xarray tgt_qps; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; const struct ib_gid_attr *sgid_attr; enum rdma_ah_attr_type type; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { IB_POLL_SOFTIRQ, /* poll from softirq context */ IB_POLL_WORKQUEUE, /* poll from workqueue */ IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE, IB_POLL_DIRECT, /* caller context, no hw completions */ }; struct ib_cq { struct ib_device *device; struct ib_ucq_object *uobject; ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; unsigned int cqe_used; atomic_t usecnt; /* count number of work queues */ enum ib_poll_context poll_ctx; struct ib_wc *wc; struct list_head pool_entry; union { struct irq_poll iop; struct work_struct work; }; struct workqueue_struct *comp_wq; struct dim *dim; /* updated only by trace points */ ktime_t timestamp; u8 interrupt:1; u8 shared:1; unsigned int comp_vector; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_usrq_object *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; u32 srq_num; } xrc; }; } ext; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; enum ib_raw_packet_caps { /* * Strip cvlan from incoming packet and report it in the matching work * completion is supported. */ IB_RAW_PACKET_CAP_CVLAN_STRIPPING = IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING, /* * Scatter FCS field of an incoming packet to host memory is supported. */ IB_RAW_PACKET_CAP_SCATTER_FCS = IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS, /* Checksum offloads are supported (for both send and receive). */ IB_RAW_PACKET_CAP_IP_CSUM = IB_UVERBS_RAW_PACKET_CAP_IP_CSUM, /* * When a packet is received for an RQ with no receive WQEs, the * packet processing is delayed. */ IB_RAW_PACKET_CAP_DELAY_DROP = IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP, }; enum ib_wq_type { IB_WQT_RQ = IB_UVERBS_WQT_RQ, }; enum ib_wq_state { IB_WQS_RESET, IB_WQS_RDY, IB_WQS_ERR }; struct ib_wq { struct ib_device *device; struct ib_uwq_object *uobject; void *wq_context; void (*event_handler)(struct ib_event *, void *); struct ib_pd *pd; struct ib_cq *cq; u32 wq_num; enum ib_wq_state state; enum ib_wq_type wq_type; atomic_t usecnt; }; enum ib_wq_flags { IB_WQ_FLAGS_CVLAN_STRIPPING = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING, IB_WQ_FLAGS_SCATTER_FCS = IB_UVERBS_WQ_FLAGS_SCATTER_FCS, IB_WQ_FLAGS_DELAY_DROP = IB_UVERBS_WQ_FLAGS_DELAY_DROP, IB_WQ_FLAGS_PCI_WRITE_END_PADDING = IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING, }; struct ib_wq_init_attr { void *wq_context; enum ib_wq_type wq_type; u32 max_wr; u32 max_sge; struct ib_cq *cq; void (*event_handler)(struct ib_event *, void *); u32 create_flags; /* Use enum ib_wq_flags */ }; enum ib_wq_attr_mask { IB_WQ_STATE = 1 << 0, IB_WQ_CUR_STATE = 1 << 1, IB_WQ_FLAGS = 1 << 2, }; struct ib_wq_attr { enum ib_wq_state wq_state; enum ib_wq_state curr_wq_state; u32 flags; /* Use enum ib_wq_flags */ u32 flags_mask; /* Use enum ib_wq_flags */ }; struct ib_rwq_ind_table { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; u32 ind_tbl_num; u32 log_ind_tbl_size; struct ib_wq **ind_tbl; }; struct ib_rwq_ind_table_init_attr { u32 log_ind_tbl_size; /* Each entry is a pointer to Receive Work Queue */ struct ib_wq **ind_tbl; }; enum port_pkey_state { IB_PORT_PKEY_NOT_VALID = 0, IB_PORT_PKEY_VALID = 1, IB_PORT_PKEY_LISTED = 2, }; struct ib_qp_security; struct ib_port_pkey { enum port_pkey_state state; u16 pkey_index; u32 port_num; struct list_head qp_list; struct list_head to_error_list; struct ib_qp_security *sec; }; struct ib_ports_pkeys { struct ib_port_pkey main; struct ib_port_pkey alt; }; struct ib_qp_security { struct ib_qp *qp; struct ib_device *dev; /* Hold this mutex when changing port and pkey settings. */ struct mutex mutex; struct ib_ports_pkeys *ports_pkeys; /* A list of all open shared QP handles. Required to enforce security * properly for all users of a shared QP. */ struct list_head shared_qp_list; void *security; bool destroying; atomic_t error_list_count; struct completion error_complete; int error_comps_pending; }; /* * @max_write_sge: Maximum SGE elements per RDMA WRITE request. * @max_read_sge: Maximum SGE elements per RDMA READ request. */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; spinlock_t mr_lock; int mrs_used; struct list_head rdma_mrs; struct list_head sig_mrs; struct ib_srq *srq; struct completion srq_completion; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uqp_object *uobject; void (*event_handler)(struct ib_event *, void *); void (*registered_event_handler)(struct ib_event *, void *); void *qp_context; /* sgid_attrs associated with the AV's */ const struct ib_gid_attr *av_sgid_attr; const struct ib_gid_attr *alt_path_sgid_attr; u32 qp_num; u32 max_write_sge; u32 max_read_sge; enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; u32 port; bool integrity_en; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; /* The counter the qp is bind to */ struct rdma_counter *counter; }; struct ib_dm { struct ib_device *device; u32 length; u32 flags; struct ib_uobject *uobject; atomic_t usecnt; }; /* bit values to mark existence of ib_dmah fields */ enum { IB_DMAH_CPU_ID_EXISTS, IB_DMAH_MEM_TYPE_EXISTS, IB_DMAH_PH_EXISTS, }; struct ib_dmah { struct ib_device *device; struct ib_uobject *uobject; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; u32 cpu_id; enum tph_mem_type mem_type; atomic_t usecnt; u8 ph; u8 valid_fields; /* use IB_DMAH_XXX_EXISTS */ }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; u32 lkey; u32 rkey; u64 iova; u64 length; unsigned int page_size; enum ib_mr_type type; bool need_inval; union { struct ib_uobject *uobject; /* user */ struct list_head qp_entry; /* FR */ }; struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ struct ib_dmah *dmah; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; enum ib_mw_type type; }; /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ IB_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IB_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IB_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IB_FLOW_ATTR_SNIFFER = 0x3 }; /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, IB_FLOW_SPEC_IPV6 = 0x31, IB_FLOW_SPEC_ESP = 0x34, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, IB_FLOW_SPEC_GRE = 0x51, IB_FLOW_SPEC_MPLS = 0x60, IB_FLOW_SPEC_INNER = 0x100, /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, IB_FLOW_SPEC_ACTION_DROP = 0x1001, IB_FLOW_SPEC_ACTION_HANDLE = 0x1002, IB_FLOW_SPEC_ACTION_COUNT = 0x1003, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 10 enum ib_flow_flags { IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ IB_FLOW_ATTR_FLAGS_EGRESS = 1UL << 2, /* Egress flow */ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 3 /* Must be last */ }; struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; }; struct ib_flow_spec_eth { u32 type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; }; struct ib_flow_ib_filter { __be16 dlid; __u8 sl; }; struct ib_flow_spec_ib { u32 type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; }; /* IPv4 header flags */ enum ib_ipv4_flags { IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the last have this flag set */ }; struct ib_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; u8 proto; u8 tos; u8 ttl; u8 flags; }; struct ib_flow_spec_ipv4 { u32 type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; }; struct ib_flow_ipv6_filter { u8 src_ip[16]; u8 dst_ip[16]; __be32 flow_label; u8 next_hdr; u8 traffic_class; u8 hop_limit; } __packed; struct ib_flow_spec_ipv6 { u32 type; u16 size; struct ib_flow_ipv6_filter val; struct ib_flow_ipv6_filter mask; }; struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; }; struct ib_flow_spec_tcp_udp { u32 type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; }; struct ib_flow_tunnel_filter { __be32 tunnel_id; }; /* ib_flow_spec_tunnel describes the Vxlan tunnel * the tunnel_id from val has the vni value */ struct ib_flow_spec_tunnel { u32 type; u16 size; struct ib_flow_tunnel_filter val; struct ib_flow_tunnel_filter mask; }; struct ib_flow_esp_filter { __be32 spi; __be32 seq; }; struct ib_flow_spec_esp { u32 type; u16 size; struct ib_flow_esp_filter val; struct ib_flow_esp_filter mask; }; struct ib_flow_gre_filter { __be16 c_ks_res0_ver; __be16 protocol; __be32 key; }; struct ib_flow_spec_gre { u32 type; u16 size; struct ib_flow_gre_filter val; struct ib_flow_gre_filter mask; }; struct ib_flow_mpls_filter { __be32 tag; }; struct ib_flow_spec_mpls { u32 type; u16 size; struct ib_flow_mpls_filter val; struct ib_flow_mpls_filter mask; }; struct ib_flow_spec_action_tag { enum ib_flow_spec_type type; u16 size; u32 tag_id; }; struct ib_flow_spec_action_drop { enum ib_flow_spec_type type; u16 size; }; struct ib_flow_spec_action_handle { enum ib_flow_spec_type type; u16 size; struct ib_flow_action *act; }; enum ib_counters_description { IB_COUNTER_PACKETS, IB_COUNTER_BYTES, }; struct ib_flow_spec_action_count { enum ib_flow_spec_type type; u16 size; struct ib_counters *counters; }; union ib_flow_spec { struct { u32 type; u16 size; }; struct ib_flow_spec_eth eth; struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; struct ib_flow_spec_ipv6 ipv6; struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_esp esp; struct ib_flow_spec_gre gre; struct ib_flow_spec_mpls mpls; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; struct ib_flow_spec_action_handle action; struct ib_flow_spec_action_count flow_count; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; u32 flags; u8 num_of_specs; u32 port; union ib_flow_spec flows[]; }; struct ib_flow { struct ib_qp *qp; struct ib_device *device; struct ib_uobject *uobject; }; enum ib_flow_action_type { IB_FLOW_ACTION_UNSPECIFIED, IB_FLOW_ACTION_ESP = 1, }; struct ib_flow_action_attrs_esp_keymats { enum ib_uverbs_flow_action_esp_keymat protocol; union { struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm; } keymat; }; struct ib_flow_action_attrs_esp_replays { enum ib_uverbs_flow_action_esp_replay protocol; union { struct ib_uverbs_flow_action_esp_replay_bmp bmp; } replay; }; enum ib_flow_action_attrs_esp_flags { /* All user-space flags at the top: Use enum ib_uverbs_flow_action_esp_flags * This is done in order to share the same flags between user-space and * kernel and spare an unnecessary translation. */ /* Kernel flags */ IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32, IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33, }; struct ib_flow_spec_list { struct ib_flow_spec_list *next; union ib_flow_spec spec; }; struct ib_flow_action_attrs_esp { struct ib_flow_action_attrs_esp_keymats *keymat; struct ib_flow_action_attrs_esp_replays *replay; struct ib_flow_spec_list *encap; /* Used only if IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is enabled. * Value of 0 is a valid value. */ u32 esn; u32 spi; u32 seq; u32 tfc_pad; /* Use enum ib_flow_action_attrs_esp_flags */ u64 flags; u64 hard_limit_pkts; }; struct ib_flow_action { struct ib_device *device; struct ib_uobject *uobject; enum ib_flow_action_type type; atomic_t usecnt; }; struct ib_mad; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; struct ib_port_cache { u64 subnet_prefix; struct ib_pkey_cache *pkey; struct ib_gid_table *gid; u8 lmc; enum ib_port_state port_state; enum ib_port_state last_port_state; }; struct ib_port_immutable { int pkey_tbl_len; int gid_tbl_len; u32 core_cap_flags; u32 max_mad_size; }; struct ib_port_data { struct ib_device *ib_dev; struct ib_port_immutable immutable; spinlock_t pkey_list_lock; spinlock_t netdev_lock; struct list_head pkey_list; struct ib_port_cache cache; struct net_device __rcu *netdev; netdevice_tracker netdev_tracker; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; struct ib_port *sysfs; }; /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { RDMA_NETDEV_OPA_VNIC, RDMA_NETDEV_IPOIB, }; /** * struct rdma_netdev - rdma netdev * For cases where netstack interfacing is required. */ struct rdma_netdev { void *clnt_priv; struct ib_device *hca; u32 port_num; int mtu; /* * cleanup function must be specified. * FIXME: This is only used for OPA_VNIC and that usage should be * removed too. */ void (*free_rdma_netdev)(struct net_device *netdev); /* control functions */ void (*set_id)(struct net_device *netdev, int id); /* send packet */ int (*send)(struct net_device *dev, struct sk_buff *skb, struct ib_ah *address, u32 dqpn); /* multicast */ int (*attach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid, int set_qkey, u32 qkey); int (*detach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid); /* timeout */ void (*tx_timeout)(struct net_device *dev, unsigned int txqueue); }; struct rdma_netdev_alloc_params { size_t sizeof_priv; unsigned int txqs; unsigned int rxqs; void *param; int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num, struct net_device *netdev, void *param); }; struct ib_odp_counters { atomic64_t faults; atomic64_t faults_handled; atomic64_t invalidations; atomic64_t invalidations_handled; atomic64_t prefetch; }; struct ib_counters { struct ib_device *device; struct ib_uobject *uobject; /* num of objects attached */ atomic_t usecnt; }; struct ib_counters_read_attr { u64 *counters_buff; u32 ncounters; u32 flags; /* use enum ib_read_counters_flags */ }; struct uverbs_attr_bundle; struct iw_cm_id; struct iw_cm_conn_param; #define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \ .size_##ib_struct = \ (sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \ BUILD_BUG_ON_ZERO( \ !__same_type(((struct drv_struct *)NULL)->member, \ struct ib_struct))) #define rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, gfp) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ gfp, false)) #define rdma_zalloc_drv_obj_numa(ib_dev, ib_type) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ GFP_KERNEL, true)) #define rdma_zalloc_drv_obj(ib_dev, ib_type) \ rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, GFP_KERNEL) #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct struct rdma_user_mmap_entry { struct kref ref; struct ib_ucontext *ucontext; unsigned long start_pgoff; size_t npages; bool driver_removed; }; /* Return the offset (in bytes) the user should pass to libc's mmap() */ static inline u64 rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) { return (u64)entry->start_pgoff << PAGE_SHIFT; } /** * struct ib_device_ops - InfiniBand device operations * This structure defines all the InfiniBand device operations, providers will * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; unsigned int uverbs_no_driver_id_binding:1; /* * NOTE: New drivers should not make use of device_group; instead new * device parameter should be exposed via netlink command. This * mechanism exists only for existing drivers. */ const struct attribute_group *device_group; const struct attribute_group **port_groups; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); void (*drain_rq)(struct ib_qp *qp); void (*drain_sq)(struct ib_qp *qp); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*post_srq_recv)(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int (*process_mad)(struct ib_device *device, int process_mad_flags, u32 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad *in_mad, struct ib_mad *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr, struct ib_udata *udata); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); void (*get_dev_fw_str)(struct ib_device *device, char *str); const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); /* * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device * in fast paths. */ int (*get_port_immutable)(struct ib_device *device, u32 port_num, struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u32 port_num); /* * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee * that this function returns NULL before the net device has finished * NETDEV_UNREGISTER state. */ struct net_device *(*get_netdev)(struct ib_device *device, u32 port_num); /* * rdma netdev operation * * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params * must return -EOPNOTSUPP if it doesn't support the specified type. */ struct net_device *(*alloc_rdma_netdev)( struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); /* * query_gid should be return GID value for @device, when @port_num * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. */ int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); /* * When calling add_gid, the HW vendor's driver should add the gid * of device of port at gid index available at @attr. Meta-info of * that gid (for example, the network device related to this gid) is * available at @attr. @context allows the HW vendor driver to store * extra information together with a GID entry. The HW vendor driver may * allocate memory to contain this information and store it in @context * when a new GID entry is written to. Params are consistent until the * next call of add_gid or delete_gid. The function should return 0 on * success or error otherwise. The function could be called * concurrently for different ports. This function is only called when * roce_gid_table is used. */ int (*add_gid)(const struct ib_gid_attr *attr, void **context); /* * When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index gid_index of port port_num * available in @attr. * Upon the deletion of a GID entry, the HW vendor must free any * allocated memory. The caller will clear @context afterwards. * This function is only called when roce_gid_table is used. */ int (*del_gid)(const struct ib_gid_attr *attr, void **context); int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int (*alloc_ucontext)(struct ib_ucontext *context, struct ib_udata *udata); void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); /* * This will be called once refcount of an entry in mmap_xa reaches * zero. The type of the memory that was mapped may differ between * entries and is opaque to the rdma_user_mmap interface. * Therefore needs to be implemented by the driver in mmap_free. */ void (*mmap_free)(struct rdma_user_mmap_entry *entry); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*create_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*create_user_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah, u32 flags); int (*create_srq)(struct ib_srq *srq, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq, struct ib_udata *udata); int (*create_qp)(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); int (*create_cq_umem)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct ib_umem *umem, struct uverbs_attr_bundle *attrs); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); /* * pre_destroy_cq - Prevent a cq from generating any new work * completions, but not free any kernel resources */ int (*pre_destroy_cq)(struct ib_cq *cq); /* * post_destroy_cq - Free all kernel resources */ void (*post_destroy_cq)(struct ib_cq *cq); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata); struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); int (*advise_mr)(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge, struct uverbs_attr_bundle *attrs); /* * Kernel users should universally support relaxed ordering (RO), as * they are designed to read data only after observing the CQE and use * the DMA API correctly. * * Some drivers implicitly enable RO if platform supports it. */ int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); int (*alloc_mw)(struct ib_mw *mw, struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); struct ib_flow *(*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, struct ib_udata *udata); int (*destroy_flow)(struct ib_flow *flow_id); int (*destroy_flow_action)(struct ib_flow_action *action); int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port, int state); int (*get_vf_config)(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *ivf); int (*get_vf_stats)(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int (*get_vf_guid)(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid, int type); struct ib_wq *(*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata); int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask, struct ib_udata *udata); int (*create_rwq_ind_table)(struct ib_rwq_ind_table *ib_rwq_ind_table, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dm *(*alloc_dm)(struct ib_device *device, struct ib_ucontext *context, struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); int (*alloc_dmah)(struct ib_dmah *ibdmah, struct uverbs_attr_bundle *attrs); int (*dealloc_dmah)(struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); int (*create_counters)(struct ib_counters *counters, struct uverbs_attr_bundle *attrs); int (*destroy_counters)(struct ib_counters *counters); int (*read_counters)(struct ib_counters *counters, struct ib_counters_read_attr *counters_read_attr, struct uverbs_attr_bundle *attrs); int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset); /* * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and * fill in the driver initialized data. The struct is kfree()'ed by * the sysfs core when the device is removed. A lifespan of -1 in the * return struct tells the core to set a default lifespan. */ struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device); struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device, u32 port_num); /* * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated * Return codes - * < 0 - Error, no counters updated * index - Updated the single counter pointed to by index * num_counters - Updated all counters (will reset the timestamp * and prevent further calls for lifespan milliseconds) * Drivers are allowed to update all counters in leiu of just the * one given in index at their option */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u32 port, int index); /* * modify_hw_stat - Modify the counter configuration * @enable: true/false when enable/disable a counter * Return codes - 0 on success or error code otherwise. */ int (*modify_hw_stat)(struct ib_device *device, u32 port, unsigned int counter_index, bool enable); /* * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id); int (*fill_res_srq_entry)(struct sk_buff *msg, struct ib_srq *ib_srq); int (*fill_res_srq_entry_raw)(struct sk_buff *msg, struct ib_srq *ib_srq); /* Device lifecycle callbacks */ /* * Called after the device becomes registered, before clients are * attached */ int (*enable_driver)(struct ib_device *dev); /* * This is called as part of ib_dealloc_device(). */ void (*dealloc_driver)(struct ib_device *dev); /* iWarp CM callbacks */ void (*iw_add_ref)(struct ib_qp *qp); void (*iw_rem_ref)(struct ib_qp *qp); struct ib_qp *(*iw_get_qp)(struct ib_device *device, int qpn); int (*iw_connect)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_accept)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); /* * counter_bind_qp - Bind a QP to a counter. * @counter - The counter to be bound. If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp, u32 port); /* * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ int (*counter_unbind_qp)(struct ib_qp *qp, u32 port); /* * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); /* * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in * the driver initialized data. */ struct rdma_hw_stats *(*counter_alloc_stats)( struct rdma_counter *counter); /* * counter_update_stats - Query the stats value of this counter */ int (*counter_update_stats)(struct rdma_counter *counter); /* * counter_init - Initialize the driver specific rdma counter struct. */ void (*counter_init)(struct rdma_counter *counter); /* * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. */ int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); /* query driver for its ucontext properties */ int (*query_ucontext)(struct ib_ucontext *context, struct uverbs_attr_bundle *attrs); /* * Provide NUMA node. This API exists for rdmavt/hfi1 only. * Everyone else relies on Linux memory management model. */ int (*get_numa_node)(struct ib_device *dev); /* * add_sub_dev - Add a sub IB device */ struct ib_device *(*add_sub_dev)(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); /* * del_sub_dev - Delete a sub IB device */ void (*del_sub_dev)(struct ib_device *sub_dev); /* * ufile_cleanup - Attempt to cleanup ubojects HW resources inside * the ufile. */ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); /* * report_port_event - Drivers need to implement this if they have * some private stuff to handle when link status changes. */ void (*report_port_event)(struct ib_device *ibdev, struct net_device *ndev, unsigned long event); DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_dmah); DECLARE_RDMA_OBJ_SIZE(ib_mw); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_qp); DECLARE_RDMA_OBJ_SIZE(ib_rwq_ind_table); DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); DECLARE_RDMA_OBJ_SIZE(ib_xrcd); DECLARE_RDMA_OBJ_SIZE(rdma_counter); }; struct ib_core_device { /* device must be the first element in structure until, * union of ib_core_device and device exists in ib_device. */ struct device dev; possible_net_t rdma_net; struct kobject *ports_kobj; struct list_head port_list; struct ib_device *owner; /* reach back to owner ib_device */ }; struct rdma_restrack_root; struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. */ struct device *dma_device; struct ib_device_ops ops; char name[IB_DEVICE_NAME_MAX]; struct rcu_head rcu_head; struct list_head event_handler_list; /* Protects event_handler_list */ struct rw_semaphore event_handler_rwsem; /* Protects QP's event_handler calls and open_qp list */ spinlock_t qp_open_list_lock; struct rw_semaphore client_data_rwsem; struct xarray client_data; struct mutex unregistration_lock; /* Synchronize GID, Pkey cache entries, subnet prefix, LMC */ rwlock_t cache_lock; /** * port_data is indexed by port number */ struct ib_port_data *port_data; int num_comp_vectors; union { struct device dev; struct ib_core_device coredev; }; /* First group is for device attributes, * Second group is for driver provided attributes (optional). * Third group is for the hw_stats * It is a NULL terminated array. */ const struct attribute_group *groups[4]; u8 hw_stats_attr_index; u64 uverbs_cmd_mask; char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; /* Indicates kernel verbs support, should not be used in drivers */ u16 kverbs_provider:1; /* CQ adaptive moderation (RDMA DIM) */ u16 use_cq_dim:1; u8 node_type; u32 phys_port_cnt; struct ib_device_attr attrs; struct hw_stats_device_data *hw_stats_data; #ifdef CONFIG_CGROUP_RDMA struct rdmacg_device cg_device; #endif u32 index; spinlock_t cq_pools_lock; struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1]; struct rdma_restrack_root *res; const struct uapi_definition *driver_def; /* * Positive refcount indicates that the device is currently * registered and cannot be unregistered. */ refcount_t refcount; struct completion unreg_completion; struct work_struct unregistration_work; const struct rdma_link_ops *link_ops; /* Protects compat_devs xarray modifications */ struct mutex compat_devs_mutex; /* Maintains compat devices for each net namespace */ struct xarray compat_devs; /* Used by iWarp CM */ char iw_ifname[IFNAMSIZ]; u32 iw_driver_flags; u32 lag_flags; /* A parent device has a list of sub-devices */ struct mutex subdev_lock; struct list_head subdev_list_head; /* A sub device has a type and a parent */ enum rdma_nl_dev_type type; struct ib_device *parent; struct list_head subdev_list; enum rdma_nl_name_assign_type name_assign_type; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, gfp_t gfp, bool is_numa_aware) { if (is_numa_aware && dev->ops.get_numa_node) return kzalloc_node(size, gfp, dev->ops.get_numa_node(dev)); return kzalloc(size, gfp); } struct ib_client_nl_info; struct ib_client { const char *name; int (*add)(struct ib_device *ibdev); void (*remove)(struct ib_device *, void *client_data); void (*rename)(struct ib_device *dev, void *client_data); int (*get_nl_info)(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res); int (*get_global_nl_info)(struct ib_client_nl_info *res); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. * @dev: An RDMA device that the net_dev use for communication. * @port: A physical port number on the RDMA device. * @pkey: P_Key that the net_dev uses if applicable. * @gid: A GID that the net_dev uses to communicate. * @addr: An IP address the net_dev is configured with. * @client_data: The device's client data set by ib_set_client_data(). * * An ib_client that implements a net_dev on top of RDMA devices * (such as IP over IB) should implement this callback, allowing the * rdma_cm module to find the right net_dev for a given request. * * The caller is responsible for calling dev_put on the returned * netdev. */ struct net_device *(*get_net_dev_by_params)( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); refcount_t uses; struct completion uses_zero; u32 client_id; /* kverbs are not required by the client */ u8 no_kverbs_req:1; }; /* * IB block DMA iterator * * Iterates the DMA-mapped SGL in contiguous memory blocks aligned * to a HW supported page size. */ struct ib_block_iter { /* internal states */ struct scatterlist *__sg; /* sg holding the current aligned block */ dma_addr_t __dma_addr; /* unaligned DMA address of this block */ size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ unsigned int __sg_nents; /* number of SG entries */ unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ unsigned int __pg_bit; /* alignment of current block */ }; struct ib_device *_ib_alloc_device(size_t size, struct net *net); #define ib_alloc_device(drv_struct, member) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ struct drv_struct, member)), \ &init_net), \ struct drv_struct, member) #define ib_alloc_device_with_net(drv_struct, member, net) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ struct drv_struct, member)), net), \ struct drv_struct, member) void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device); void ib_unregister_device(struct ib_device *device); void ib_unregister_driver(enum rdma_driver_id driver_id); void ib_unregister_device_and_put(struct ib_device *device); void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void __rdma_block_iter_start(struct ib_block_iter *biter, struct scatterlist *sglist, unsigned int nents, unsigned long pgsz); bool __rdma_block_iter_next(struct ib_block_iter *biter); /** * rdma_block_iter_dma_address - get the aligned dma address of the current * block held by the block iterator. * @biter: block iterator holding the memory block */ static inline dma_addr_t rdma_block_iter_dma_address(struct ib_block_iter *biter) { return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); } /** * rdma_for_each_block - iterate over contiguous memory blocks of the sg list * @sglist: sglist to iterate over * @biter: block iterator holding the memory block * @nents: maximum number of sg entries to iterate over * @pgsz: best HW supported page size to use * * Callers may use rdma_block_iter_dma_address() to get each * blocks aligned DMA address. */ #define rdma_for_each_block(sglist, biter, nents, pgsz) \ for (__rdma_block_iter_start(biter, sglist, nents, \ pgsz); \ __rdma_block_iter_next(biter);) /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns the client context data set with * ib_set_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ static inline void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { return xa_load(&device->client_data, client->client_id); } void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); void ib_set_device_ops(struct ib_device *device, const struct ib_device_ops *ops); int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, unsigned long pfn, unsigned long size, pgprot_t prot, struct rdma_user_mmap_entry *entry); int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length); int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 min_pgoff, u32 max_pgoff); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) void rdma_user_mmap_disassociate(struct ib_device *device); #else static inline void rdma_user_mmap_disassociate(struct ib_device *device) { } #endif static inline int rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 pgoff) { return rdma_user_mmap_entry_insert_range(ucontext, entry, length, pgoff, pgoff); } struct rdma_user_mmap_entry * rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext, unsigned long pgoff); struct rdma_user_mmap_entry * rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, struct vm_area_struct *vma); void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry); void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } static inline bool ib_is_buffer_cleared(const void __user *p, size_t len) { bool ret; u8 *buf; if (len > USHRT_MAX) return false; buf = memdup_user(p, len); if (IS_ERR(buf)) return false; ret = !memchr_inv(buf, 0, len); kfree(buf); return ret; } static inline bool ib_is_udata_cleared(struct ib_udata *udata, size_t offset, size_t len) { return ib_is_buffer_cleared(udata->inbuf + offset, len); } /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. * @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(const struct ib_event *event); int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u32 port_num); /** * rdma_cap_ib_switch - Check if the device is IB switch * @device: Device to check * * Device driver is responsible for setting is_switch bit on * in ib_device structure at init time. * * Return: true if the device is IB switch. */ static inline bool rdma_cap_ib_switch(const struct ib_device *device) { return device->is_switch; } /** * rdma_start_port - Return the first valid port number for the device * specified * * @device: Device to be checked * * Return start port number */ static inline u32 rdma_start_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : 1; } /** * rdma_for_each_port - Iterate over all valid port numbers of the IB device * @device: The struct ib_device * to iterate over * @iter: The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ for (iter = rdma_start_port(device + \ BUILD_BUG_ON_ZERO(!__same_type(u32, \ iter))); \ iter <= rdma_end_port(device); iter++) /** * rdma_end_port - Return the last valid port number for the device * specified * * @device: Device to be checked * * Return last port number */ static inline u32 rdma_end_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; } static inline int rdma_is_port_valid(const struct ib_device *device, unsigned int port) { return (port >= rdma_start_port(device) && port <= rdma_end_port(device)); } static inline bool rdma_is_grh_required(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_IB_GRH_REQUIRED; } static inline bool rdma_protocol_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u32 port_num) { return rdma_protocol_ib(device, port_num) || rdma_protocol_roce(device, port_num); } static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; } static inline bool rdma_protocol_usnic(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; } /** * rdma_cap_ib_mad - Check if the port of a device supports Infiniband * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Management Datagrams (MAD) are a required part of the InfiniBand * specification and are supported on all InfiniBand devices. A slightly * extended version are also supported on OPA interfaces. * * Return: true if the port supports sending/receiving of MAD packets. */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_MAD; } /** * rdma_cap_opa_mad - Check if the port of device provides support for OPA * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Intel OmniPath devices extend and/or replace the InfiniBand Management * datagrams with their own versions. These OPA MADs share many but not all of * the characteristics of InfiniBand MADs. * * OPA MADs differ in the following ways: * * 1) MADs are variable size up to 2K * IBTA defined MADs remain fixed at 256 bytes * 2) OPA SMPs must carry valid PKeys * 3) OPA SMP packets are a different format * * Return: true if the port supports OPA MAD packet formats. */ static inline bool rdma_cap_opa_mad(struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_MAD; } /** * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). * @device: Device to check * @port_num: Port number to check * * Each InfiniBand node is required to provide a Subnet Management Agent * that the subnet manager can access. Prior to the fabric being fully * configured by the subnet manager, the SMA is accessed via a well known * interface called the Subnet Management Interface (SMI). This interface * uses directed route packets to communicate with the SM to get around the * chicken and egg problem of the SM needing to know what's on the fabric * in order to configure the fabric, and needing to configure the fabric in * order to send packets to the devices on the fabric. These directed * route packets do not need the fabric fully configured in order to reach * their destination. The SMI is the only method allowed to send * directed route packets on an InfiniBand fabric. * * Return: true if the port provides an SMI. */ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SMI; } /** * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband * Communication Manager. * @device: Device to check * @port_num: Port number to check * * The InfiniBand Communication Manager is one of many pre-defined General * Service Agents (GSA) that are accessed via the General Service * Interface (GSI). It's role is to facilitate establishment of connections * between nodes as well as other management related tasks for established * connections. * * Return: true if the port supports an IB CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_CM; } /** * rdma_cap_iw_cm - Check if the port of device has the capability IWARP * Communication Manager. * @device: Device to check * @port_num: Port number to check * * Similar to above, but specific to iWARP connections which have a different * managment protocol than InfiniBand. * * Return: true if the port supports an iWARP CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IW_CM; } /** * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband * Subnet Administration. * @device: Device to check * @port_num: Port number to check * * An InfiniBand Subnet Administration (SA) service is a pre-defined General * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand * fabrics, devices should resolve routes to other hosts by contacting the * SA to query the proper route. * * Return: true if the port should act as a client to the fabric Subnet * Administration interface. This does not imply that the SA service is * running locally. */ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SA; } /** * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband * Multicast. * @device: Device to check * @port_num: Port number to check * * InfiniBand multicast registration is more complex than normal IPv4 or * IPv6 multicast registration. Each Host Channel Adapter must register * with the Subnet Manager when it wishes to join a multicast group. It * should do so only once regardless of how many queue pairs it subscribes * to this group. And it should leave the group only after all queue pairs * attached to the group have been detached. * * Return: true if the port must undertake the additional adminstrative * overhead of registering/unregistering with the SM and tracking of the * total number of queue pairs attached to the multicast group. */ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u32 port_num) { return rdma_cap_ib_sa(device, port_num); } /** * rdma_cap_af_ib - Check if the port of device has the capability * Native Infiniband Address. * @device: Device to check * @port_num: Port number to check * * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default * GID. RoCE uses a different mechanism, but still generates a GID via * a prescribed mechanism and port specific data. * * Return: true if the port uses a GID address to identify devices on the * network. */ static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_AF_IB; } /** * rdma_cap_eth_ah - Check if the port of device has the capability * Ethernet Address Handle. * @device: Device to check * @port_num: Port number to check * * RoCE is InfiniBand over Ethernet, and it uses a well defined technique * to fabricate GIDs over Ethernet/IP specific addresses native to the * port. Normally, packet headers are generated by the sending host * adapter, but when sending connectionless datagrams, we must manually * inject the proper headers for the fabric we are communicating over. * * Return: true if we are running as a RoCE port and must force the * addition of a Global Route Header built from our Ethernet Address * Handle into our header list for connectionless packets. */ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_ETH_AH; } /** * rdma_cap_opa_ah - Check if the port of device supports * OPA Address handles * @device: Device to check * @port_num: Port number to check * * Return: true if we are running on an OPA device which supports * the extended OPA addressing. */ static inline bool rdma_cap_opa_ah(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; } /** * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. * * @device: Device * @port_num: Port number * * This MAD size includes the MAD headers and MAD payload. No other headers * are included. * * Return the max MAD size required by the Port. Will return 0 if the port * does not support MADs */ static inline size_t rdma_max_mad_size(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.max_mad_size; } /** * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table * @device: Device to check * @port_num: Port number to check * * RoCE GID table mechanism manages the various GIDs for a device. * * NOTE: if allocating the port's GID table has failed, this call will still * return true, but any RoCE GID table API will fail. * * Return: true if the port uses RoCE GID table mechanism in order to manage * its GIDs. */ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, u32 port_num) { return rdma_protocol_roce(device, port_num) && device->ops.add_gid && device->ops.del_gid; } /* * Check if the device supports READ W/ INVALIDATE. */ static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) { /* * iWarp drivers must support READ W/ INVALIDATE. No other protocol * has support for it yet. */ return rdma_protocol_iwarp(dev, port_num); } /** * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not. * @device: Device * @port_num: 1 based Port number * * Return true if port is an Intel OPA port , false if not */ static inline bool rdma_core_cap_opa_port(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_INTEL_OPA) == RDMA_CORE_PORT_INTEL_OPA; } /** * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. * @device: Device * @port: Port number * @mtu: enum value of MTU * * Return the MTU size supported by the port as an integer value. Will return * -1 if enum value of mtu is not supported. */ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, int mtu) { if (rdma_core_cap_opa_port(device, port)) return opa_mtu_enum_to_int((enum opa_mtu)mtu); else return ib_mtu_enum_to_int((enum ib_mtu)mtu); } /** * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. * @device: Device * @port: Port number * @attr: port attribute * * Return the MTU size supported by the port as an integer value. */ static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port, struct ib_port_attr *attr) { if (rdma_core_cap_opa_port(device, port)) return attr->phys_mtu; else return ib_mtu_enum_to_int(attr->max_mtu); } int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state); int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info); int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type); int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index); enum ib_pd_flags { /* * Create a memory registration for all memory in the system and place * the rkey for it into pd->unsafe_global_rkey. This can be used by * ULPs to avoid the overhead of dynamic MRs. * * This flag is generally considered unsafe and must only be used in * extremly trusted environments. Every use of it will log a warning * in the kernel log. */ IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, }; struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller); /** * ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * @flags: protection domain flags * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. * * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. */ #define ib_alloc_pd(device, flags) \ __ib_alloc_pd((device), (flags), KBUILD_MODNAME) int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata); /** * ib_dealloc_pd - Deallocate kernel PD * @pd: The protection domain * * NOTE: for user PD use ib_dealloc_pd_user with valid udata! */ static inline void ib_dealloc_pd(struct ib_pd *pd) { int ret = ib_dealloc_pd_user(pd, NULL); WARN_ONCE(ret, "Destroy of kernel PD shouldn't fail"); } enum rdma_create_ah_flags { /* In a sleepable context */ RDMA_CREATE_AH_SLEEPABLE = BIT(0), }; /** * rdma_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @flags: Create address handle flags (see enum rdma_create_ah_flags). * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags); /** * rdma_create_user_ah - Creates an address handle for the given address vector. * It resolves destination mac address for ah attribute of RoCE type. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @udata: pointer to user's input output buffer information need by * provider driver. * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata); /** * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header * work completion. * @hdr: the L3 header to parse * @net_type: type of header to parse * @sgid: place to store source gid * @dgid: place to store destination gid */ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, enum rdma_network_type net_type, union ib_gid *sgid, union ib_gid *dgid); /** * ib_get_rdma_header_version - Get the header version * @hdr: the L3 header to parse */ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); /** * ib_init_ah_attr_from_wc - Initializes address handle attributes from a * work completion. * @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. * When ib_init_ah_attr_from_wc() returns success, * (a) for IB link layer it optionally contains a reference to SGID attribute * when GRH is present for IB link layer. * (b) for RoCE link layer it contains a reference to SGID attribute. * User must invoke rdma_cleanup_ah_attr_gid_attr() to release reference to SGID * attributes which are initialized using ib_init_ah_attr_from_wc(). * */ int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u32 port_num); /** * rdma_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); /** * rdma_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); enum rdma_destroy_ah_flags { /* In a sleepable context */ RDMA_DESTROY_AH_SLEEPABLE = BIT(0), }; /** * rdma_destroy_ah_user - Destroys an address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * @udata: Valid user data or NULL for kernel objects */ int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata); /** * rdma_destroy_ah - Destroys an kernel address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * * NOTE: for user ah use rdma_destroy_ah_user with valid udata! */ static inline void rdma_destroy_ah(struct ib_ah *ah, u32 flags) { int ret = rdma_destroy_ah_user(ah, flags, NULL); WARN_ONCE(ret, "Destroy of kernel AH shouldn't fail"); } struct ib_srq *ib_create_srq_user(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_usrq_object *uobject, struct ib_udata *udata); static inline struct ib_srq * ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) { if (!pd->device->ops.create_srq) return ERR_PTR(-EOPNOTSUPP); return ib_create_srq_user(pd, srq_init_attr, NULL, NULL); } /** * ib_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified. * * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or * IB_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask); /** * ib_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. */ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); /** * ib_destroy_srq_user - Destroys the specified SRQ. * @srq: The SRQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata); /** * ib_destroy_srq - Destroys the specified kernel SRQ. * @srq: The SRQ to destroy. * * NOTE: for user srq use ib_destroy_srq_user with valid udata! */ static inline void ib_destroy_srq(struct ib_srq *srq) { int ret = ib_destroy_srq_user(srq, NULL); WARN_ONCE(ret, "Destroy of kernel SRQ shouldn't fail"); } /** * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return srq->device->ops.post_srq_recv(srq, recv_wr, bad_recv_wr ? : &dummy); } struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, const char *caller); /** * ib_create_qp - Creates a kernel QP associated with the specific protection * domain. * @pd: The protection domain associated with the QP. * @init_attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. */ static inline struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr) { return ib_create_qp_kernel(pd, init_attr, KBUILD_MODNAME); } /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. * @qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. * @udata: pointer to user's input output buffer information * are being modified. * It returns 0 on success and returns appropriate error code on error. */ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. * @qp: The QP to modify. * @qp_attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @qp_attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. */ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask); /** * ib_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @qp_attr: The attributes of the specified QP. * @qp_attr_mask: A bit-mask used to select specific attributes to query. * @qp_init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. */ int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); /** * ib_destroy_qp - Destroys the specified QP. * @qp: The QP to destroy. * @udata: Valid udata or NULL for kernel objects */ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata); /** * ib_destroy_qp - Destroys the specified kernel QP. * @qp: The QP to destroy. * * NOTE: for user qp use ib_destroy_qp_user with valid udata! */ static inline int ib_destroy_qp(struct ib_qp *qp) { return ib_destroy_qp_user(qp, NULL); } /** * ib_open_qp - Obtain a reference to an existing sharable QP. * @xrcd: XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. */ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr); /** * ib_close_qp - Release an external reference to a QP. * @qp: The QP handle to release * * The opened QP handle is released by the caller. The underlying * shared QP is not destroyed until all internal references are released. */ int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. * @qp: The QP to post the work request on. * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. * * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate * error is returned, the QP state shall not be affected, * ib_post_send() will return an immediate error after queueing any * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr) { const struct ib_send_wr *dummy; return qp->device->ops.post_send(qp, send_wr, bad_send_wr ? : &dummy); } /** * ib_post_recv - Posts a list of work requests to the receive queue of * the specified QP. * @qp: The QP to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr ? : &dummy); } struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx, const char *caller); static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) { return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, KBUILD_MODNAME); } struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx, const char *caller); /** * ib_alloc_cq_any: Allocate kernel CQ * @dev: The IB device * @private: Private data attached to the CQE * @nr_cqe: Number of CQEs in the CQ * @poll_ctx: Context used for polling the CQ */ static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx) { return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx, KBUILD_MODNAME); } void ib_free_cq(struct ib_cq *cq); int ib_process_cq_direct(struct ib_cq *cq, int budget); /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. * @comp_handler: A user-specified callback that is invoked when a * completion event occurs on the CQ. * @event_handler: A user-specified callback that is invoked when an * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *__ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr, const char *caller); #define ib_create_cq(device, cmp_hndlr, evt_hndlr, cq_ctxt, cq_attr) \ __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), (cq_attr), KBUILD_MODNAME) /** * ib_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. */ int ib_resize_cq(struct ib_cq *cq, int cqe); /** * rdma_set_cq_moderation - Modifies moderation params of the CQ * @cq: The CQ to modify. * @cq_count: number of CQEs that will trigger an event * @cq_period: max period of time in usec before triggering an event * */ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq_user - Destroys the specified CQ. * @cq: The CQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata); /** * ib_destroy_cq - Destroys the specified kernel CQ. * @cq: The CQ to destroy. * * NOTE: for user cq use ib_destroy_cq_user with valid udata! */ static inline void ib_destroy_cq(struct ib_cq *cq) { int ret = ib_destroy_cq_user(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); } /** * ib_poll_cq - poll a CQ for completion(s) * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries &struct ib_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and < num_entries, then the CQ was emptied. */ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { return cq->device->ops.poll_cq(cq, num_entries, wc); } /** * ib_req_notify_cq - Request completion notification on a CQ. * @cq: The CQ to generate an event for. * @flags: * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP * to request an event on the next solicited event or next work * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS * may also be |ed in to request a hint about missed events, as * described below. * * Return Value: * < 0 means an error occurred while requesting notification * == 0 means notification was requested successfully, and if * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events * were missed and it is safe to wait for another event. In * this case is it guaranteed that any work completions added * to the CQ since the last CQ poll will trigger a completion * notification event. * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed * in. It means that the consumer must poll the CQ again to * make sure it is empty to avoid missing an event because of a * race between requesting notification and an entry being * added to the CQ. This return value means it is possible * (but not guaranteed) that a work completion has been added * to the CQ since the last poll without triggering a * completion notification event. */ static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { return cq->device->ops.req_notify_cq(cq, flags); } struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, int comp_vector_hint, enum ib_poll_context poll_ctx); void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe); /* * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to * NULL. This causes the ib_dma* helpers to just stash the kernel virtual * address into the dma address. */ static inline bool ib_uses_virt_dma(struct ib_device *dev) { return IS_ENABLED(CONFIG_INFINIBAND_VIRT_DMA) && !dev->dma_device; } /* * Check if a IB device's underlying DMA mapping supports P2PDMA transfers. */ static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return false; return dma_pci_p2pdma_supported(dev->dma_device); } /** * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after * going through the dma_addr marshalling. */ static inline void *ib_virt_dma_to_ptr(u64 dma_addr) { /* virt_dma mode maps the kvs's directly into the dma addr */ return (void *)(uintptr_t)dma_addr; } /** * ib_virt_dma_to_page - Convert a dma_addr to a struct page * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() device to get back to the struct page after going * through the dma_addr marshalling. */ static inline struct page *ib_virt_dma_to_page(u64 dma_addr) { return virt_to_page(ib_virt_dma_to_ptr(dma_addr)); } /** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check */ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) { if (ib_uses_virt_dma(dev)) return 0; return dma_mapping_error(dev->dma_device, dma_addr); } /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created * @cpu_addr: The kernel virtual address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)cpu_addr; return dma_map_single(dev->dma_device, cpu_addr, size, direction); } /** * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_single(dev->dma_device, addr, size, direction); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)(page_address(page) + offset); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_page(dev->dma_device, addr, size, direction); } int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (ib_uses_virt_dma(dev)) return ib_dma_virt_map_sg(dev, sg, nents); return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } /** * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sgt: The sg_table object describing the buffer * @direction: The direction of the DMA * @dma_attrs: Optional DMA attributes for the map operation */ static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { int nents; if (ib_uses_virt_dma(dev)) { nents = ib_dma_virt_map_sg(dev, sgt->sgl, sgt->orig_nents); if (!nents) return -EIO; sgt->nents = nents; return 0; } return dma_map_sgtable(dev->dma_device, sgt, direction, dma_attrs); } static inline void ib_dma_unmap_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { return ib_dma_map_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { ib_dma_unmap_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_max_seg_size - Return the size limit of a single DMA transfer * @dev: The device to query * * The returned value represents a size in bytes. */ static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return UINT_MAX; return dma_get_max_seg_size(dev->dma_device); } /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); } /** * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_device(dev->dma_device, addr, size, dir); } /* ib_reg_user_mr - register a memory region for virtual addresses from kernel * space. This function should be called when 'current' is the owning MM. */ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags); /* ib_advise_mr - give an advice about an address range in a memory region */ int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge); /** * ib_dereg_mr_user - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * @udata: Valid user data or NULL for kernel object * * This function can fail, if the memory region has memory windows bound to it. */ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata); /** * ib_dereg_mr - Deregisters a kernel memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * * This function can fail, if the memory region has memory windows bound to it. * * NOTE: for user mr use ib_dereg_mr_user with valid udata! */ static inline int ib_dereg_mr(struct ib_mr *mr) { return ib_dereg_mr_user(mr, NULL); } struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. * @mr: struct ib_mr pointer to be updated. * @newkey: new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { mr->lkey = (mr->lkey & 0xffffff00) | newkey; mr->rkey = (mr->rkey & 0xffffff00) | newkey; } /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. * @rkey: the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return ((rkey + 1) & mask) | (rkey & ~mask); } /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type * IB_QPT_UD. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to send and receive multicast packets, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, struct inode *inode, struct ib_udata *udata); int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); static inline int ib_check_mr_access(struct ib_device *ib_dev, unsigned int flags) { u64 device_cap = ib_dev->attrs.device_cap_flags; /* * Local write permission is required if remote write or * remote atomic permission is also requested. */ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && !(flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; if (flags & ~IB_ACCESS_SUPPORTED) return -EINVAL; if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EOPNOTSUPP; if ((flags & IB_ACCESS_FLUSH_GLOBAL && !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) || (flags & IB_ACCESS_FLUSH_PERSISTENT && !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))) return -EOPNOTSUPP; return 0; } static inline bool ib_access_writable(int access_flags) { /* * We have writable memory backing the MR if any of the following * access flags are set. "Local write" and "remote write" obviously * require write access. "Remote atomic" can do things like fetch and * add, which will modify memory, and "MW bind" can change permissions * by binding a window. */ return access_flags & (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND); } /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected * ib_mr. first use is for signature status check. * * @mr: A memory region. * @check_mask: Bitmask of which checks to perform from * ib_mr_status_check enumeration. * @mr_status: The container of relevant status checks. * failed checks will be indicated in the status bitmask * and the relevant info shall be in the error item. */ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); /** * ib_device_try_get: Hold a registration lock * @dev: The device to lock * * A device under an active registration lock cannot become unregistered. It * is only possible to obtain a registration lock on a device that is fully * registered, otherwise this function returns false. * * The registration lock is only necessary for actions which require the * device to still be registered. Uses that only require the device pointer to * be valid should use get_device(&ibdev->dev) to hold the memory. * */ static inline bool ib_device_try_get(struct ib_device *dev) { return refcount_inc_not_zero(&dev->refcount); } void ib_device_put(struct ib_device *device); struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port); static inline enum ib_port_state ib_get_curr_port_state(struct net_device *net_dev) { return (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; } void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset, unsigned int page_size); static inline int ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { int n; n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); mr->iova = 0; return n; } int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_ROCE) return attr->roce.dmac; return NULL; } static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.dlid = (u16)dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.dlid = dlid; } static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.dlid; return 0; } static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl) { attr->sl = sl; } static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr) { return attr->sl; } static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr, u8 src_path_bits) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.src_path_bits = src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.src_path_bits = src_path_bits; } static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.src_path_bits; return 0; } static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, bool make_grd) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.make_grd = make_grd; } static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.make_grd; return false; } static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num) { attr->port_num = port_num; } static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr) { return attr->port_num; } static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr, u8 static_rate) { attr->static_rate = static_rate; } static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr) { return attr->static_rate; } static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr, enum ib_ah_flags flag) { attr->ah_flags = flag; } static inline enum ib_ah_flags rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr) { return attr->ah_flags; } static inline const struct ib_global_route *rdma_ah_read_grh(const struct rdma_ah_attr *attr) { return &attr->grh; } /*To retrieve and modify the grh */ static inline struct ib_global_route *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr) { return &attr->grh; } static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid)); } static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr, __be64 prefix) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.subnet_prefix = prefix; } static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr, __be64 if_id) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.interface_id = if_id; } static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 sgid_index, u8 hop_limit, u8 traffic_class) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); attr->ah_flags = IB_AH_GRH; if (dgid) grh->dgid = *dgid; grh->flow_label = flow_label; grh->sgid_index = sgid_index; grh->hop_limit = hop_limit; grh->traffic_class = traffic_class; grh->sgid_attr = NULL; } void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr); void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 hop_limit, u8 traffic_class, const struct ib_gid_attr *sgid_attr); void rdma_copy_ah_attr(struct rdma_ah_attr *dest, const struct rdma_ah_attr *src); void rdma_replace_ah_attr(struct rdma_ah_attr *old, const struct rdma_ah_attr *new); void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src); /** * rdma_ah_find_type - Return address handle type. * * @dev: Device to be checked * @port_num: Port number */ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, u32 port_num) { if (rdma_protocol_roce(dev, port_num)) return RDMA_AH_ATTR_TYPE_ROCE; if (rdma_protocol_ib(dev, port_num)) { if (rdma_cap_opa_ah(dev, port_num)) return RDMA_AH_ATTR_TYPE_OPA; return RDMA_AH_ATTR_TYPE_IB; } if (dev->type == RDMA_DEVICE_TYPE_SMI) return RDMA_AH_ATTR_TYPE_IB; return RDMA_AH_ATTR_TYPE_UNDEFINED; } /** * ib_lid_cpu16 - Return lid in 16bit CPU encoding. * In the current implementation the only way to * get the 32bit lid is from other sources for OPA. * For IB, lids will always be 16bits so cast the * value accordingly. * * @lid: A 32bit LID */ static inline u16 ib_lid_cpu16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return (u16)lid; } /** * ib_lid_be16 - Return lid in 16bit BE encoding. * * @lid: A 32bit LID */ static inline __be16 ib_lid_be16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return cpu_to_be16((u16)lid); } /** * ib_get_vector_affinity - Get the affinity mappings of a given completion * vector * @device: the rdma device * @comp_vector: index of completion vector * * Returns NULL on failure, otherwise a corresponding cpu map of the * completion vector (returns all-cpus map if the device driver doesn't * implement get_vector_affinity). */ static inline const struct cpumask * ib_get_vector_affinity(struct ib_device *device, int comp_vector) { if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || !device->ops.get_vector_affinity) return NULL; return device->ops.get_vector_affinity(device, comp_vector); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ibdev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port); void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs); #else static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) { return 0; } static inline bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) { return false; } #endif struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), struct net_device *netdev); /** * rdma_device_to_ibdev - Get ib_device pointer from device pointer * * @device: device pointer for which ib_device pointer to retrieve * * rdma_device_to_ibdev() retrieves ib_device pointer from device. * */ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) { struct ib_core_device *coredev = container_of(device, struct ib_core_device, dev); return coredev->owner; } /** * ibdev_to_node - return the NUMA node for a given ib_device * @ibdev: device to get the NUMA node for. */ static inline int ibdev_to_node(struct ib_device *ibdev) { struct device *parent = ibdev->dev.parent; if (!parent) return NUMA_NO_NODE; return dev_to_node(parent); } /** * rdma_device_to_drv_device - Helper macro to reach back to driver's * ib_device holder structure from device pointer. * * NOTE: New drivers should not make use of this API; This API is only for * existing drivers who have exposed sysfs entries using * ops->device_group. */ #define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \ container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); bool rdma_dev_has_raw_cap(const struct ib_device *dev); static inline struct net *rdma_dev_net(struct ib_device *device) { return read_pnet(&device->coredev.rdma_net); } #define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) #define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF) #define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) /** * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based * on the flow_label * @fl: flow_label value * * This function will convert the 20 bit flow_label input to a valid RoCE v2 * UDP src port 14 bit value. All RoCE V2 drivers should use this same * convention. */ static inline u16 rdma_flow_label_to_udp_sport(u32 fl) { u32 fl_low = fl & 0x03fff, fl_high = fl & 0xFC000; fl_low ^= fl_high >> 14; return (u16)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN); } /** * rdma_calc_flow_label - generate a RDMA symmetric flow label value based on * local and remote qpn values * * This function folded the multiplication results of two qpns, 24 bit each, * fields, and converts it to a 20 bit results. * * This function will create symmetric flow_label value based on the local * and remote qpn values. this will allow both the requester and responder * to calculate the same flow_label for a given connection. * * This helper function should be used by driver in case the upper layer * provide a zero flow_label value. This is to improve entropy of RDMA * traffic in the network. */ static inline u32 rdma_calc_flow_label(u32 lqpn, u32 rqpn) { u64 v = (u64)lqpn * rqpn; v ^= v >> 20; v ^= v >> 40; return (u32)(v & IB_GRH_FLOWLABEL_MASK); } /** * rdma_get_udp_sport - Calculate and set UDP source port based on the flow * label. If flow label is not defined in GRH then * calculate it based on lqpn/rqpn. * * @fl: flow label from GRH * @lqpn: local qp number * @rqpn: remote qp number */ static inline u16 rdma_get_udp_sport(u32 fl, u32 lqpn, u32 rqpn) { if (!fl) fl = rdma_calc_flow_label(lqpn, rqpn); return rdma_flow_label_to_udp_sport(fl); } const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port); /** ib_add_sub_device - Add a sub IB device on an existing one * * @parent: The IB device that needs to add a sub device * @type: The type of the new sub device * @name: The name of the new sub device * * * Return 0 on success, an error code otherwise */ int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); /** ib_del_sub_device_and_put - Delect an IB sub device while holding a 'get' * * @sub: The sub device that is going to be deleted * * Return 0 on success, an error code otherwise */ int ib_del_sub_device_and_put(struct ib_device *sub); static inline void ib_mark_name_assigned_by_user(struct ib_device *ibdev) { ibdev->name_assign_type = RDMA_NAME_ASSIGN_TYPE_USER; } #endif /* IB_VERBS_H */
4 6 5 4 4 6 3 20 19 19 2 2 1 17 6 5 4 11 3 2 1 6 5 1 20 14 5 5 5 5 5 5 5 5 7 6 6 6 6 6 7 7 7 7 7 5 5 8 9 2 7 7 6 6 3 6 5 4 3 3 7 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 // SPDX-License-Identifier: GPL-2.0-or-later /* Asymmetric public-key cryptography key type * * See Documentation/crypto/asymmetric-keys.rst * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <keys/asymmetric-subtype.h> #include <keys/asymmetric-parser.h> #include <crypto/public_key.h> #include <linux/seq_file.h> #include <linux/module.h> #include <linux/overflow.h> #include <linux/slab.h> #include <linux/ctype.h> #include <keys/system_keyring.h> #include <keys/user-type.h> #include "asymmetric_keys.h" static LIST_HEAD(asymmetric_key_parsers); static DECLARE_RWSEM(asymmetric_key_parsers_sem); /** * find_asymmetric_key - Find a key by ID. * @keyring: The keys to search. * @id_0: The first ID to look for or NULL. * @id_1: The second ID to look for or NULL, matched together with @id_0 * against @keyring keys' id[0] and id[1]. * @id_2: The fallback ID to match against @keyring keys' id[2] if both of the * other IDs are NULL. * @partial: Use partial match for @id_0 and @id_1 if true, exact if false. * * Find a key in the given keyring by identifier. The preferred identifier is * the id_0 and the fallback identifier is the id_1. If both are given, the * former is matched (exactly or partially) against either of the sought key's * identifiers and the latter must match the found key's second identifier * exactly. If both are missing, id_2 must match the sought key's third * identifier exactly. */ struct key *find_asymmetric_key(struct key *keyring, const struct asymmetric_key_id *id_0, const struct asymmetric_key_id *id_1, const struct asymmetric_key_id *id_2, bool partial) { struct key *key; key_ref_t ref; const char *lookup; char *req, *p; int len; if (id_0) { lookup = id_0->data; len = id_0->len; } else if (id_1) { lookup = id_1->data; len = id_1->len; } else if (id_2) { lookup = id_2->data; len = id_2->len; } else { WARN_ON(1); return ERR_PTR(-EINVAL); } /* Construct an identifier "id:<keyid>". */ p = req = kmalloc(2 + 1 + len * 2 + 1, GFP_KERNEL); if (!req) return ERR_PTR(-ENOMEM); if (!id_0 && !id_1) { *p++ = 'd'; *p++ = 'n'; } else if (partial) { *p++ = 'i'; *p++ = 'd'; } else { *p++ = 'e'; *p++ = 'x'; } *p++ = ':'; p = bin2hex(p, lookup, len); *p = 0; pr_debug("Look up: \"%s\"\n", req); ref = keyring_search(make_key_ref(keyring, 1), &key_type_asymmetric, req, true); if (IS_ERR(ref)) pr_debug("Request for key '%s' err %ld\n", req, PTR_ERR(ref)); kfree(req); if (IS_ERR(ref)) { switch (PTR_ERR(ref)) { /* Hide some search errors */ case -EACCES: case -ENOTDIR: case -EAGAIN: return ERR_PTR(-ENOKEY); default: return ERR_CAST(ref); } } key = key_ref_to_ptr(ref); if (id_0 && id_1) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); if (!kids->id[1]) { pr_debug("First ID matches, but second is missing\n"); goto reject; } if (!asymmetric_key_id_same(id_1, kids->id[1])) { pr_debug("First ID matches, but second does not\n"); goto reject; } } pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key)); return key; reject: key_put(key); return ERR_PTR(-EKEYREJECTED); } EXPORT_SYMBOL_GPL(find_asymmetric_key); /** * asymmetric_key_generate_id: Construct an asymmetric key ID * @val_1: First binary blob * @len_1: Length of first binary blob * @val_2: Second binary blob * @len_2: Length of second binary blob * * Construct an asymmetric key ID from a pair of binary blobs. */ struct asymmetric_key_id *asymmetric_key_generate_id(const void *val_1, size_t len_1, const void *val_2, size_t len_2) { struct asymmetric_key_id *kid; size_t kid_sz; size_t len; if (check_add_overflow(len_1, len_2, &len)) return ERR_PTR(-EOVERFLOW); if (check_add_overflow(sizeof(struct asymmetric_key_id), len, &kid_sz)) return ERR_PTR(-EOVERFLOW); kid = kmalloc(kid_sz, GFP_KERNEL); if (!kid) return ERR_PTR(-ENOMEM); kid->len = len; memcpy(kid->data, val_1, len_1); memcpy(kid->data + len_1, val_2, len_2); return kid; } EXPORT_SYMBOL_GPL(asymmetric_key_generate_id); /** * asymmetric_key_id_same - Return true if two asymmetric keys IDs are the same. * @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_same(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len != kid2->len) return false; return memcmp(kid1->data, kid2->data, kid1->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_same); /** * asymmetric_key_id_partial - Return true if two asymmetric keys IDs * partially match * @kid1: The key ID to compare * @kid2: The key ID to compare */ bool asymmetric_key_id_partial(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2) { if (!kid1 || !kid2) return false; if (kid1->len < kid2->len) return false; return memcmp(kid1->data + (kid1->len - kid2->len), kid2->data, kid2->len) == 0; } EXPORT_SYMBOL_GPL(asymmetric_key_id_partial); /** * asymmetric_match_key_ids - Search asymmetric key IDs 1 & 2 * @kids: The pair of key IDs to check * @match_id: The key ID we're looking for * @match: The match function to use */ static bool asymmetric_match_key_ids( const struct asymmetric_key_ids *kids, const struct asymmetric_key_id *match_id, bool (*match)(const struct asymmetric_key_id *kid1, const struct asymmetric_key_id *kid2)) { int i; if (!kids || !match_id) return false; for (i = 0; i < 2; i++) if (match(kids->id[i], match_id)) return true; return false; } /* helper function can be called directly with pre-allocated memory */ inline int __asymmetric_key_hex_to_key_id(const char *id, struct asymmetric_key_id *match_id, size_t hexlen) { match_id->len = hexlen; return hex2bin(match_id->data, id, hexlen); } /** * asymmetric_key_hex_to_key_id - Convert a hex string into a key ID. * @id: The ID as a hex string. */ struct asymmetric_key_id *asymmetric_key_hex_to_key_id(const char *id) { struct asymmetric_key_id *match_id; size_t asciihexlen; int ret; if (!*id) return ERR_PTR(-EINVAL); asciihexlen = strlen(id); if (asciihexlen & 1) return ERR_PTR(-EINVAL); match_id = kmalloc(sizeof(struct asymmetric_key_id) + asciihexlen / 2, GFP_KERNEL); if (!match_id) return ERR_PTR(-ENOMEM); ret = __asymmetric_key_hex_to_key_id(id, match_id, asciihexlen / 2); if (ret < 0) { kfree(match_id); return ERR_PTR(-EINVAL); } return match_id; } /* * Match asymmetric keys by an exact match on one of the first two IDs. */ static bool asymmetric_key_cmp(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_same); } /* * Match asymmetric keys by a partial match on one of the first two IDs. */ static bool asymmetric_key_cmp_partial(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return asymmetric_match_key_ids(kids, match_id, asymmetric_key_id_partial); } /* * Match asymmetric keys by an exact match on the third IDs. */ static bool asymmetric_key_cmp_name(const struct key *key, const struct key_match_data *match_data) { const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *match_id = match_data->preparsed; return kids && asymmetric_key_id_same(kids->id[2], match_id); } /* * Preparse the match criterion. If we don't set lookup_type and cmp, * the default will be an exact match on the key description. * * There are some specifiers for matching key IDs rather than by the key * description: * * "id:<id>" - find a key by partial match on one of the first two IDs * "ex:<id>" - find a key by exact match on one of the first two IDs * "dn:<id>" - find a key by exact match on the third ID * * These have to be searched by iteration rather than by direct lookup because * the key is hashed according to its description. */ static int asymmetric_key_match_preparse(struct key_match_data *match_data) { struct asymmetric_key_id *match_id; const char *spec = match_data->raw_data; const char *id; bool (*cmp)(const struct key *, const struct key_match_data *) = asymmetric_key_cmp; if (!spec || !*spec) return -EINVAL; if (spec[0] == 'i' && spec[1] == 'd' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_partial; } else if (spec[0] == 'e' && spec[1] == 'x' && spec[2] == ':') { id = spec + 3; } else if (spec[0] == 'd' && spec[1] == 'n' && spec[2] == ':') { id = spec + 3; cmp = asymmetric_key_cmp_name; } else { goto default_match; } match_id = asymmetric_key_hex_to_key_id(id); if (IS_ERR(match_id)) return PTR_ERR(match_id); match_data->preparsed = match_id; match_data->cmp = cmp; match_data->lookup_type = KEYRING_SEARCH_LOOKUP_ITERATE; return 0; default_match: return 0; } /* * Free the preparsed the match criterion. */ static void asymmetric_key_match_free(struct key_match_data *match_data) { kfree(match_data->preparsed); } /* * Describe the asymmetric key */ static void asymmetric_key_describe(const struct key *key, struct seq_file *m) { const struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); const struct asymmetric_key_ids *kids = asymmetric_key_ids(key); const struct asymmetric_key_id *kid; const unsigned char *p; int n; seq_puts(m, key->description); if (subtype) { seq_puts(m, ": "); subtype->describe(key, m); if (kids && kids->id[1]) { kid = kids->id[1]; seq_putc(m, ' '); n = kid->len; p = kid->data; if (n > 4) { p += n - 4; n = 4; } seq_printf(m, "%*phN", n, p); } seq_puts(m, " ["); /* put something here to indicate the key's capabilities */ seq_putc(m, ']'); } } /* * Preparse a asymmetric payload to get format the contents appropriately for the * internal payload to cut down on the number of scans of the data performed. * * We also generate a proposed description from the contents of the key that * can be used to name the key if the user doesn't want to provide one. */ static int asymmetric_key_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_parser *parser; int ret; pr_devel("==>%s()\n", __func__); if (prep->datalen == 0) return -EINVAL; down_read(&asymmetric_key_parsers_sem); ret = -EBADMSG; list_for_each_entry(parser, &asymmetric_key_parsers, link) { pr_debug("Trying parser '%s'\n", parser->name); ret = parser->parse(prep); if (ret != -EBADMSG) { pr_debug("Parser recognised the format (ret %d)\n", ret); break; } } up_read(&asymmetric_key_parsers_sem); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Clean up the key ID list */ static void asymmetric_key_free_kids(struct asymmetric_key_ids *kids) { int i; if (kids) { for (i = 0; i < ARRAY_SIZE(kids->id); i++) kfree(kids->id[i]); kfree(kids); } } /* * Clean up the preparse data */ static void asymmetric_key_free_preparse(struct key_preparsed_payload *prep) { struct asymmetric_key_subtype *subtype = prep->payload.data[asym_subtype]; struct asymmetric_key_ids *kids = prep->payload.data[asym_key_ids]; pr_devel("==>%s()\n", __func__); if (subtype) { subtype->destroy(prep->payload.data[asym_crypto], prep->payload.data[asym_auth]); module_put(subtype->owner); } asymmetric_key_free_kids(kids); kfree(prep->description); } /* * dispose of the data dangling from the corpse of a asymmetric key */ static void asymmetric_key_destroy(struct key *key) { struct asymmetric_key_subtype *subtype = asymmetric_key_subtype(key); struct asymmetric_key_ids *kids = key->payload.data[asym_key_ids]; void *data = key->payload.data[asym_crypto]; void *auth = key->payload.data[asym_auth]; key->payload.data[asym_crypto] = NULL; key->payload.data[asym_subtype] = NULL; key->payload.data[asym_key_ids] = NULL; key->payload.data[asym_auth] = NULL; if (subtype) { subtype->destroy(data, auth); module_put(subtype->owner); } asymmetric_key_free_kids(kids); } static struct key_restriction *asymmetric_restriction_alloc( key_restrict_link_func_t check, struct key *key) { struct key_restriction *keyres = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); if (!keyres) return ERR_PTR(-ENOMEM); keyres->check = check; keyres->key = key; keyres->keytype = &key_type_asymmetric; return keyres; } /* * look up keyring restrict functions for asymmetric keys */ static struct key_restriction *asymmetric_lookup_restriction( const char *restriction) { char *restrict_method; char *parse_buf; char *next; struct key_restriction *ret = ERR_PTR(-EINVAL); if (strcmp("builtin_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_trusted, NULL); if (strcmp("builtin_and_secondary_trusted", restriction) == 0) return asymmetric_restriction_alloc( restrict_link_by_builtin_and_secondary_trusted, NULL); parse_buf = kstrndup(restriction, PAGE_SIZE, GFP_KERNEL); if (!parse_buf) return ERR_PTR(-ENOMEM); next = parse_buf; restrict_method = strsep(&next, ":"); if ((strcmp(restrict_method, "key_or_keyring") == 0) && next) { char *key_text; key_serial_t serial; struct key *key; key_restrict_link_func_t link_fn = restrict_link_by_key_or_keyring; bool allow_null_key = false; key_text = strsep(&next, ":"); if (next) { if (strcmp(next, "chain") != 0) goto out; link_fn = restrict_link_by_key_or_keyring_chain; allow_null_key = true; } if (kstrtos32(key_text, 0, &serial) < 0) goto out; if ((serial == 0) && allow_null_key) { key = NULL; } else { key = key_lookup(serial); if (IS_ERR(key)) { ret = ERR_CAST(key); goto out; } } ret = asymmetric_restriction_alloc(link_fn, key); if (IS_ERR(ret)) key_put(key); } out: kfree(parse_buf); return ret; } int asymmetric_key_eds_op(struct kernel_pkey_params *params, const void *in, void *out) { const struct asymmetric_key_subtype *subtype; struct key *key = params->key; int ret; pr_devel("==>%s()\n", __func__); if (key->type != &key_type_asymmetric) return -EINVAL; subtype = asymmetric_key_subtype(key); if (!subtype || !key->payload.data[0]) return -EINVAL; if (!subtype->eds_op) return -ENOTSUPP; ret = subtype->eds_op(params, in, out); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } static int asymmetric_key_verify_signature(struct kernel_pkey_params *params, const void *in, const void *in2) { struct public_key_signature sig = { .s_size = params->in2_len, .digest_size = params->in_len, .encoding = params->encoding, .hash_algo = params->hash_algo, .digest = (void *)in, .s = (void *)in2, }; return verify_signature(params->key, &sig); } struct key_type key_type_asymmetric = { .name = "asymmetric", .preparse = asymmetric_key_preparse, .free_preparse = asymmetric_key_free_preparse, .instantiate = generic_key_instantiate, .match_preparse = asymmetric_key_match_preparse, .match_free = asymmetric_key_match_free, .destroy = asymmetric_key_destroy, .describe = asymmetric_key_describe, .lookup_restriction = asymmetric_lookup_restriction, .asym_query = query_asymmetric_key, .asym_eds_op = asymmetric_key_eds_op, .asym_verify_signature = asymmetric_key_verify_signature, }; EXPORT_SYMBOL_GPL(key_type_asymmetric); /** * register_asymmetric_key_parser - Register a asymmetric key blob parser * @parser: The parser to register */ int register_asymmetric_key_parser(struct asymmetric_key_parser *parser) { struct asymmetric_key_parser *cursor; int ret; down_write(&asymmetric_key_parsers_sem); list_for_each_entry(cursor, &asymmetric_key_parsers, link) { if (strcmp(cursor->name, parser->name) == 0) { pr_err("Asymmetric key parser '%s' already registered\n", parser->name); ret = -EEXIST; goto out; } } list_add_tail(&parser->link, &asymmetric_key_parsers); pr_notice("Asymmetric key parser '%s' registered\n", parser->name); ret = 0; out: up_write(&asymmetric_key_parsers_sem); return ret; } EXPORT_SYMBOL_GPL(register_asymmetric_key_parser); /** * unregister_asymmetric_key_parser - Unregister a asymmetric key blob parser * @parser: The parser to unregister */ void unregister_asymmetric_key_parser(struct asymmetric_key_parser *parser) { down_write(&asymmetric_key_parsers_sem); list_del(&parser->link); up_write(&asymmetric_key_parsers_sem); pr_notice("Asymmetric key parser '%s' unregistered\n", parser->name); } EXPORT_SYMBOL_GPL(unregister_asymmetric_key_parser); /* * Module stuff */ static int __init asymmetric_key_init(void) { return register_key_type(&key_type_asymmetric); } static void __exit asymmetric_key_cleanup(void) { unregister_key_type(&key_type_asymmetric); } module_init(asymmetric_key_init); module_exit(asymmetric_key_cleanup);
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005 Mellanox Technologies. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/device.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/cdev.h> #include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> #include <rdma/ib.h> #include <rdma/uverbs_std_types.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_ucaps.h> #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace verbs access"); MODULE_LICENSE("Dual BSD/GPL"); enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS, IB_UVERBS_NUM_FIXED_MINOR = 32, IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR, }; #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) static dev_t dynamic_uverbs_dev; static DEFINE_IDA(uverbs_ida); static int ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); static struct ib_client uverbs_client; static char *uverbs_devnode(const struct device *dev, umode_t *mode) { if (mode) *mode = 0666; return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } static const struct class uverbs_class = { .name = "infiniband_verbs", .devnode = uverbs_devnode, }; /* * Must be called with the ufile->device->disassociate_srcu held, and the lock * must be held until use of the ucontext is finished. */ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile) { /* * We do not hold the hw_destroy_rwsem lock for this flow, instead * srcu is used. It does not matter if someone races this with * get_context, we get NULL or valid ucontext. */ struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext); if (!srcu_dereference(ufile->device->ib_dev, &ufile->device->disassociate_srcu)) return ERR_PTR(-EIO); if (!ucontext) return ERR_PTR(-EINVAL); return ucontext; } EXPORT_SYMBOL(ib_uverbs_get_ucontext_file); int uverbs_dealloc_mw(struct ib_mw *mw) { struct ib_pd *pd = mw->pd; int ret; ret = mw->device->ops.dealloc_mw(mw); if (ret) return ret; atomic_dec(&pd->usecnt); kfree(mw); return ret; } static void ib_uverbs_release_dev(struct device *device) { struct ib_uverbs_device *dev = container_of(device, struct ib_uverbs_device, dev); uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); mutex_destroy(&dev->lists_mutex); mutex_destroy(&dev->xrcd_tree_mutex); kfree(dev); } void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, struct ib_ucq_object *uobj) { struct ib_uverbs_event *evt, *tmp; if (ev_file) { spin_lock_irq(&ev_file->ev_queue.lock); list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) { list_del(&evt->list); kfree(evt); } spin_unlock_irq(&ev_file->ev_queue.lock); uverbs_uobject_put(&ev_file->uobj); } ib_uverbs_release_uevent(&uobj->uevent);