/* SPDX-License-Identifier: GPL-2.0 */
/******************************************************************************
 * x86_emulate.h
 *
 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
 *
 * Copyright (c) 2005 Keir Fraser
 *
 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
 */

#ifndef _ASM_X86_KVM_X86_EMULATE_H
#define _ASM_X86_KVM_X86_EMULATE_H

#include <asm/desc_defs.h>
#include "fpu.h"

struct x86_emulate_ctxt;
enum x86_intercept;
enum x86_intercept_stage;

struct x86_exception {
	u8 vector;
	bool error_code_valid;
	u16 error_code;
	bool nested_page_fault;
	u64 address; /* cr2 or nested page fault gpa */
	u8 async_page_fault;
};

/*
 * This struct is used to carry enough information from the instruction
 * decoder to main KVM so that a decision can be made whether the
 * instruction needs to be intercepted or not.
 */
struct x86_instruction_info {
	u8 intercept;		/* which intercept */
	u8 rep_prefix;		/* rep prefix?
*/ u8 modrm_mod; /* mod part of modrm */ u8 modrm_reg; /* index of register used */ u8 modrm_rm; /* rm part of modrm */ u64 src_val; /* value of source operand */ u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ u8 ad_bytes; /* size of src/dst address */ u64 next_rip; /* rip following the instruction */ }; /* * x86_emulate_ops: * * These operations represent the instruction emulator's interface to memory. * There are two categories of operation: those that act on ordinary memory * regions (*_std), and those that act on memory regions known to require * special treatment or emulation (*_emulated). * * The emulator assumes that an instruction accesses only one 'emulated memory' * location, that this location is the given linear faulting address (cr2), and * that this is one of the instruction's data operands. Instruction fetches and * stack operations are assumed never to access emulated memory. The emulator * automatically deduces which operand of a string-move operation is accessing * emulated memory, and assumes that the other operand accesses normal memory. * * NOTES: * 1. The emulator isn't very smart about emulated vs. standard memory. * 'Emulated memory' access addresses should be checked for sanity. * 'Normal memory' accesses may fault, and the caller must arrange to * detect and handle reentrancy into the emulator via recursive faults. * Accesses may be unaligned and may cross page boundaries. * 2. If the access fails (cannot emulate, or a standard access faults) then * it is up to the memop to propagate the fault to the guest VM via * some out-of-band mechanism, unknown to the emulator. The memop signals * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will * then immediately bail. * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only * cmpxchg8b_emulated need support 8-byte accesses. * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. */ /* Access completed successfully: continue emulation as normal. */ #define X86EMUL_CONTINUE 0 /* Access is unhandleable: bail from emulation and return error to caller. */ #define X86EMUL_UNHANDLEABLE 1 /* Terminate emulation but return success to the caller. */ #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ /* x86-specific emulation flags */ #define X86EMUL_F_WRITE BIT(0) #define X86EMUL_F_FETCH BIT(1) #define X86EMUL_F_IMPLICIT BIT(2) #define X86EMUL_F_INVLPG BIT(3) struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* * read_gpr: read a general purpose register (rax - r15) * * @reg: gpr number. */ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg); /* * write_gpr: write a general purpose register (rax - r15) * * @reg: gpr number. * @val: value to write. */ void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val); /* * read_std: Read bytes of standard (non-emulated/special) memory. * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. * @system:[IN ] Whether the access is forced to be at CPL0. 
*/ int (*read_std)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault, bool system); /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. * @addr: [IN ] Linear address to which to write. * @val: [OUT] Value write to memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to write to memory. * @system:[IN ] Whether the access is forced to be at CPL0. */ int (*write_std)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault, bool system); /* * fetch: Read bytes of standard (non-emulated/special) memory. * Used for instruction fetch. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*fetch)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * read_emulated: Read bytes from emulated/special memory area. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * write_emulated: Write bytes to emulated/special memory area. * @addr: [IN ] Linear address to which to write. * @val: [IN ] Value to write to memory (low-order bytes used as * required). * @bytes: [IN ] Number of bytes to write to memory. */ int (*write_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, struct x86_exception *fault); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an * emulated/special memory area. * @addr: [IN ] Linear address to access. * @old: [IN ] Value expected to be current at @addr. * @new: [IN ] Value to write to @addr. * @bytes: [IN ] Number of bytes to access using CMPXCHG. 
*/ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *old, const void *new, unsigned int bytes, struct x86_exception *fault); void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr); int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, void *val, unsigned int count); int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, const void *val, unsigned int count); bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector, struct desc_struct *desc, u32 *base3, int seg); void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_struct *desc, u32 base3, int seg); unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, int seg); void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); int (*intercept)(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage); bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); bool (*is_smm)(struct x86_emulate_ctxt *ctxt); bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, unsigned int flags); }; /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; unsigned int bytes; unsigned int count; union { unsigned long orig_val; u64 orig_val64; }; union { unsigned long *reg; struct segmented_address { ulong ea; unsigned seg; } mem; unsigned xmm; unsigned mm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(sse128_t)]; sse128_t vec_val; u64 mm_val; void *data; }; }; struct fetch_cache { u8 data[15]; u8 *ptr; u8 *end; }; struct read_cache { u8 data[1024]; unsigned long pos; unsigned long end; }; /* Execution mode, passed to the emulator. */ enum x86emul_mode { X86EMUL_MODE_REAL, /* Real mode. */ X86EMUL_MODE_VM86, /* Virtual 8086 mode. */ X86EMUL_MODE_PROT16, /* 16-bit protected mode. */ X86EMUL_MODE_PROT32, /* 32-bit protected mode. 
*/ X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */ }; /* * fastop functions are declared as taking a never-defined fastop parameter, * so they can't be called from C directly. */ struct fastop; typedef void (*fastop_t)(struct fastop *); /* * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is * tracked/accessed via _eip, and except for RIP relative addressing, which * also uses _eip, RIP cannot be a register operand nor can it be an operand in * a ModRM or SIB byte. */ #ifdef CONFIG_X86_64 #define NR_EMULATOR_GPRS 16 #else #define NR_EMULATOR_GPRS 8 #endif struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; /* Register state before/after emulation. */ unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ enum x86emul_mode mode; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; bool perm_ok; /* do not check permissions if true */ bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; /* GPA available */ bool gpa_available; gpa_t gpa_val; /* * decode cache */ /* current opcode length in bytes */ u8 opcode_len; u8 b; u8 intercept; u8 op_bytes; u8 ad_bytes; union { int (*execute)(struct x86_emulate_ctxt *ctxt); fastop_t fop; }; int (*check_perm)(struct x86_emulate_ctxt *ctxt); bool rip_relative; u8 rex_prefix; u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ u16 regs_valid; /* bitmaps of registers in _regs[] that have been written */ u16 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; u8 seg_override; u64 d; unsigned long _eip; /* Here begins the usercopy section. 
*/ struct operand src; struct operand src2; struct operand dst; struct operand memop; unsigned long _regs[NR_EMULATOR_GPRS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; bool is_branch; }; #define KVM_EMULATOR_BUG_ON(cond, ctxt) \ ({ \ int __ret = (cond); \ \ if (WARN_ON_ONCE(__ret)) \ ctxt->ops->vm_bugged(ctxt); \ unlikely(__ret); \ }) /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 /* CPUID vendors */ #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 #define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 #define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41 #define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 #define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 #define X86EMUL_CPUID_VENDOR_HygonGenuine_ebx 0x6f677948 #define X86EMUL_CPUID_VENDOR_HygonGenuine_ecx 0x656e6975 #define X86EMUL_CPUID_VENDOR_HygonGenuine_edx 0x6e65476e #define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 #define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e #define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 #define X86EMUL_CPUID_VENDOR_CentaurHauls_ebx 0x746e6543 #define X86EMUL_CPUID_VENDOR_CentaurHauls_ecx 0x736c7561 #define X86EMUL_CPUID_VENDOR_CentaurHauls_edx 0x48727561 static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx) { return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; } static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx) { return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) || (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx); } static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx) { return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx && ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx && edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx; } enum x86_intercept_stage { X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, X86_ICPT_POST_EXCEPT, X86_ICPT_POST_MEMACCESS, }; enum x86_intercept { x86_intercept_none, x86_intercept_cr_read, x86_intercept_cr_write, x86_intercept_clts, x86_intercept_lmsw, x86_intercept_smsw, x86_intercept_dr_read, x86_intercept_dr_write, x86_intercept_lidt, x86_intercept_sidt, x86_intercept_lgdt, x86_intercept_sgdt, x86_intercept_lldt, x86_intercept_sldt, x86_intercept_ltr, x86_intercept_str, x86_intercept_rdtsc, x86_intercept_rdpmc, x86_intercept_pushf, x86_intercept_popf, x86_intercept_cpuid, x86_intercept_rsm, x86_intercept_iret, x86_intercept_intn, x86_intercept_invd, x86_intercept_pause, x86_intercept_hlt, x86_intercept_invlpg, x86_intercept_invlpga, x86_intercept_vmrun, x86_intercept_vmload, x86_intercept_vmsave, x86_intercept_vmmcall, x86_intercept_stgi, x86_intercept_clgi, x86_intercept_skinit, x86_intercept_rdtscp, x86_intercept_rdpid, x86_intercept_icebp, x86_intercept_wbinvd, x86_intercept_monitor, x86_intercept_mwait, x86_intercept_rdmsr, x86_intercept_wrmsr, x86_intercept_in, x86_intercept_ins, x86_intercept_out, x86_intercept_outs, x86_intercept_xsetbv, nr_x86_intercepts }; /* Host execution mode. 
*/ #if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 #elif defined(CONFIG_X86_64) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 void init_decode_cache(struct x86_emulate_ctxt *ctxt); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) nr &= NR_EMULATOR_GPRS - 1; if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); } return ctxt->_regs[nr]; } static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) nr &= NR_EMULATOR_GPRS - 1; BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; } static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) { reg_read(ctxt, nr); return reg_write(ctxt, nr); } #endif /* _ASM_X86_KVM_X86_EMULATE_H */
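/*
 * Example (not part of the header): the lazy GPR cache at the bottom of this
 * file is easiest to see in isolation. The standalone userspace sketch below
 * models the regs_valid/regs_dirty scheme implemented by reg_read(),
 * reg_write() and emulator_writeback_register_cache(). The vcpu_gprs[] array,
 * the read counter and the emu_* names are illustrative stand-ins for the
 * real ops->read_gpr()/ops->write_gpr() callbacks; none of this is KVM code.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_GPRS 16

/* Stand-in for the vCPU state that ops->read_gpr()/write_gpr() would touch. */
static uint64_t vcpu_gprs[NR_GPRS];
static unsigned int vcpu_reads;		/* how often the "expensive" read ran */

struct emu_ctxt {
	uint64_t regs[NR_GPRS];		/* _regs[] equivalent */
	uint16_t regs_valid;		/* bit n set: regs[n] holds a cached value */
	uint16_t regs_dirty;		/* bit n set: regs[n] must be written back */
};

/* Lazily populate the cache, mirroring reg_read(). */
static uint64_t emu_reg_read(struct emu_ctxt *ctxt, unsigned int nr)
{
	if (!(ctxt->regs_valid & (1u << nr))) {
		ctxt->regs_valid |= 1u << nr;
		ctxt->regs[nr] = vcpu_gprs[nr];
		vcpu_reads++;
	}
	return ctxt->regs[nr];
}

/* Mark the slot valid + dirty and hand out a pointer, like reg_write(). */
static uint64_t *emu_reg_write(struct emu_ctxt *ctxt, unsigned int nr)
{
	ctxt->regs_valid |= 1u << nr;
	ctxt->regs_dirty |= 1u << nr;
	return &ctxt->regs[nr];
}

/* Flush only dirty slots, like emulator_writeback_register_cache(). */
static void emu_writeback(struct emu_ctxt *ctxt)
{
	for (unsigned int nr = 0; nr < NR_GPRS; nr++)
		if (ctxt->regs_dirty & (1u << nr))
			vcpu_gprs[nr] = ctxt->regs[nr];
	ctxt->regs_valid = 0;
	ctxt->regs_dirty = 0;
}

int main(void)
{
	struct emu_ctxt ctxt = { 0 };

	vcpu_gprs[0] = 41;			/* pretend RAX == 41 */

	/* Emulate "inc %rax": read-modify-write of GPR 0, cf. reg_rmw(). */
	*emu_reg_write(&ctxt, 0) = emu_reg_read(&ctxt, 0) + 1;
	emu_reg_read(&ctxt, 0);			/* second read hits the cache */

	emu_writeback(&ctxt);
	printf("RAX=%llu, vCPU reads=%u\n",
	       (unsigned long long)vcpu_gprs[0], vcpu_reads);
	return 0;
}
/*
 * As in the real emulator, a read-modify-write must go through both helpers
 * (that is what reg_rmw() does), because reg_write() only marks the slot
 * dirty and never fetches the old value.
 */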
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/anon_inodes.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 *  Thanks to Arnd Bergmann for code review and suggestions.
 *  More changes based on Thomas Gleixner's suggestions.
 *
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/anon_inodes.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>

static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;

/*
 * anon_inodefs_dname() is called from d_path().
 */
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "anon_inode:%s",
			     dentry->d_name.name);
}

static const struct dentry_operations anon_inodefs_dentry_operations = {
	.d_dname	= anon_inodefs_dname,
};

static int anon_inodefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	ctx->dops = &anon_inodefs_dentry_operations;
	return 0;
}

static struct file_system_type anon_inode_fs_type = {
	.name		= "anon_inodefs",
	.init_fs_context = anon_inodefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static struct inode *anon_inode_make_secure_inode(const char *name,
						  const struct inode *context_inode)
{
	struct inode *inode;
	const struct qstr qname = QSTR_INIT(name, strlen(name));
	int error;

	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
	if (IS_ERR(inode))
		return inode;
	inode->i_flags &= ~S_PRIVATE;
	error = security_inode_init_security_anon(inode, &qname, context_inode);
	if (error) {
		iput(inode);
		return ERR_PTR(error);
	}
	return inode;
}

static struct file *__anon_inode_getfile(const char *name,
					 const struct file_operations *fops,
					 void *priv, int flags,
					 const struct inode *context_inode,
					 bool make_inode)
{
	struct inode *inode;
	struct file *file;

	if (fops->owner && !try_module_get(fops->owner))
		return ERR_PTR(-ENOENT);

	if (make_inode) {
		inode = anon_inode_make_secure_inode(name, context_inode);
		if (IS_ERR(inode)) {
			file = ERR_CAST(inode);
			goto err;
		}
	} else {
		inode = anon_inode_inode;
		if (IS_ERR(inode)) {
			file = ERR_PTR(-ENODEV);
			goto err;
		}
		/*
		 * We know the anon_inode inode count is always
		 * greater than zero, so ihold() is safe.
*/ ihold(inode); } file = alloc_file_pseudo(inode, anon_inode_mnt, name, flags & (O_ACCMODE | O_NONBLOCK), fops); if (IS_ERR(file)) goto err_iput; file->f_mapping = inode->i_mapping; file->private_data = priv; return file; err_iput: iput(inode); err: module_put(fops->owner); return file; } /** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. Returns the newly created file* or an error pointer. */ struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfile(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfile); /** * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the * inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns the newly created file* or an error pointer. */ struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfile(name, fops, priv, flags, context_inode, true); } EXPORT_SYMBOL_GPL(anon_inode_create_getfile); static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, bool make_inode) { int error, fd; struct file *file; error = get_unused_fd_flags(flags); if (error < 0) return error; fd = error; file = __anon_inode_getfile(name, fops, priv, flags, context_inode, make_inode); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; } fd_install(fd, file); return fd; err_put_unused_fd: put_unused_fd(fd); return error; } /** * anon_inode_getfd - creates a new file instance by hooking it up to * an anonymous inode and a dentry that describe * the "class" of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is * useful for files that do not need to have a full-fledged inode in * order to operate correctly. 
All the files created with * anon_inode_getfd() will use the same singleton inode, reducing * memory use and avoiding code duplication for the file/inode/dentry * setup. Returns a newly created file descriptor or an error code. */ int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags) { return __anon_inode_getfd(name, fops, priv, flags, NULL, false); } EXPORT_SYMBOL_GPL(anon_inode_getfd); /** * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * @context_inode: * [in] the logical relationship with the new inode (optional) * * Create a new anonymous inode and file pair. This can be done for two * reasons: * * - for the inode to have its own security context, so that LSMs can enforce * policy on the inode's creation; * * - if the caller needs a unique inode, for example in order to customize * the size returned by fstat() * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. * * Returns a newly created file descriptor or an error code. */ int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } static int __init anon_inode_init(void) { anon_inode_mnt = kern_mount(&anon_inode_fs_type); if (IS_ERR(anon_inode_mnt)) panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt)); anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(anon_inode_inode)) panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); return 0; } fs_initcall(anon_inode_init);
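/*
 * Example (not part of fs/anon_inodes.c): a hypothetical driver-style caller.
 * Everything named demo_* and the "[demo]" class name are invented for
 * illustration; only anon_inode_getfd() and the file_operations plumbing
 * come from this file. A real driver would call demo_create_fd() from
 * process context, typically an ioctl or syscall handler, so the fd lands in
 * the calling task's file table.
 */
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>

/* Hypothetical per-fd state handed to the new file as private_data. */
struct demo_ctx {
	u64 counter;
};

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t count, loff_t *ppos)
{
	struct demo_ctx *ctx = file->private_data;

	/* A real driver would copy something meaningful to userspace here. */
	ctx->counter++;
	return 0;
}

static int demo_release(struct inode *inode, struct file *file)
{
	/* All instances share the singleton inode, so per-fd cleanup goes here. */
	kfree(file->private_data);
	return 0;
}

static const struct file_operations demo_fops = {
	.owner		= THIS_MODULE,
	.read		= demo_read,
	.release	= demo_release,
};

/* Hand a new anonymous fd to the current task; returns the fd or -errno. */
static int demo_create_fd(void)
{
	struct demo_ctx *ctx;
	int fd;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	fd = anon_inode_getfd("[demo]", &demo_fops, ctx, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		kfree(ctx);
	return fd;
}
/*
 * Callers that need LSM labeling or a unique inode (e.g. to control what
 * fstat() reports) would use anon_inode_create_getfd() instead, at the cost
 * of one inode per file.
 */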
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MBCACHE_H
#define _LINUX_MBCACHE_H

#include <linux/hash.h>
#include <linux/list_bl.h>
#include <linux/list.h>
#include <linux/atomic.h>
#include <linux/fs.h>

struct mb_cache;

/* Cache entry flags */
enum {
	MBE_REFERENCED_B = 0,
	MBE_REUSABLE_B
};

struct mb_cache_entry {
	/* List of entries in cache - protected by cache->c_list_lock */
	struct list_head	e_list;
	/*
	 * Hash table list - protected by hash chain bitlock. The entry is
	 * guaranteed to be hashed while e_refcnt > 0.
	 */
	struct hlist_bl_node	e_hash_list;
	/*
	 * Entry refcount. Once it reaches zero, entry is unhashed and freed.
	 * While refcount > 0, the entry is guaranteed to stay in the hash and
	 * e.g. mb_cache_entry_try_delete() will fail.
	 */
	atomic_t		e_refcnt;
	/* Key in hash - stable during lifetime of the entry */
	u32			e_key;
	unsigned long		e_flags;
	/* User provided value - stable during lifetime of the entry */
	u64			e_value;
};

struct mb_cache *mb_cache_create(int bucket_bits);
void mb_cache_destroy(struct mb_cache *cache);

int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
			  u64 value, bool reusable);
void __mb_cache_entry_free(struct mb_cache *cache,
			   struct mb_cache_entry *entry);
void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);

static inline void mb_cache_entry_put(struct mb_cache *cache,
				      struct mb_cache_entry *entry)
{
	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);

	if (cnt > 0) {
		if (cnt <= 2)
			wake_up_var(&entry->e_refcnt);
		return;
	}
	__mb_cache_entry_free(cache, entry);
}

struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
						    u32 key, u64 value);
struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
					  u64 value);
struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
						 u32 key);
struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
						struct mb_cache_entry *entry);
void mb_cache_entry_touch(struct mb_cache *cache,
			  struct mb_cache_entry *entry);

#endif	/* _LINUX_MBCACHE_H */
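/*
 * Example (not part of this header): the API above supports the kind of
 * block deduplication that ext2/ext4 do for shared xattr blocks. The sketch
 * below is a hypothetical user of that pattern; the dedup_* names, the
 * GFP_NOFS choice and the "hash of block contents -> block number" mapping
 * are assumptions for illustration. Only the mb_cache_* calls come from this
 * header.
 */
#include <linux/mbcache.h>
#include <linux/slab.h>

static struct mb_cache *dedup_cache;

static int dedup_init(void)
{
	dedup_cache = mb_cache_create(10);	/* 2^10 hash buckets */
	return dedup_cache ? 0 : -ENOMEM;
}

/* Remember that block @blocknr holds contents hashing to @hash. */
static int dedup_remember(u32 hash, u64 blocknr)
{
	return mb_cache_entry_create(dedup_cache, GFP_NOFS, hash,
				     blocknr, true /* reusable */);
}

/*
 * Return a candidate block for @hash, or 0 if nothing is cached. A real
 * caller must still compare the block contents (hash collisions!) and walk
 * further candidates with mb_cache_entry_find_next() on a mismatch.
 */
static u64 dedup_lookup(u32 hash)
{
	struct mb_cache_entry *ce;
	u64 blocknr = 0;

	ce = mb_cache_entry_find_first(dedup_cache, hash);
	if (ce) {
		blocknr = ce->e_value;
		mb_cache_entry_touch(dedup_cache, ce);	/* keep it "hot" */
		mb_cache_entry_put(dedup_cache, ce);	/* drop the find reference */
	}
	return blocknr;
}

static void dedup_exit(void)
{
	mb_cache_destroy(dedup_cache);
}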
// SPDX-License-Identifier: GPL-2.0
/*
 * NETLINK	Generic Netlink Family
 *
 *		Authors:	Jamal Hadi Salim
 *				Thomas Graf <tgraf@suug.ch>
 *				Johannes Berg <johannes@sipsolutions.net>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/string_helpers.h>
#include <linux/skbuff.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/rwsem.h>
#include <linux/idr.h>
#include <net/sock.h>
#include <net/genetlink.h>

static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */
static DECLARE_RWSEM(cb_lock);

atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq);

void genl_lock(void)
{
	mutex_lock(&genl_mutex);
}
EXPORT_SYMBOL(genl_lock);

void genl_unlock(void)
{
	mutex_unlock(&genl_mutex);
}
EXPORT_SYMBOL(genl_unlock);

static void genl_lock_all(void)
{
	down_write(&cb_lock);
	genl_lock();
}

static void genl_unlock_all(void)
{
	genl_unlock();
	up_write(&cb_lock);
}

static void genl_op_lock(const struct genl_family *family)
{
	if (!family->parallel_ops)
		genl_lock();
}

static void genl_op_unlock(const struct genl_family *family)
{
	if (!family->parallel_ops)
		genl_unlock();
}

static DEFINE_IDR(genl_fam_idr);

/*
 * Bitmap of multicast groups that are currently in use.
* * To avoid an allocation at boot of just one unsigned long, * declare it global instead. * Bit 0 is marked as already used since group 0 is invalid. * Bit 1 is marked as already used since the drop-monitor code * abuses the API and thinks it can statically use group 1. * That group will typically conflict with other groups that * any proper users use. * Bit 16 is marked as used since it's used for generic netlink * and the code no longer marks pre-reserved IDs as used. * Bit 17 is marked as already used since the VFS quota code * also abused this API and relied on family == group ID, we * cater to that by giving it a static family and group ID. * Bit 18 is marked as already used since the PMCRAID driver * did the same thing as the VFS quota code (maybe copied?) */ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | BIT(GENL_ID_VFS_DQUOT) | BIT(GENL_ID_PMCRAID); static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; /* We need the last attribute with non-zero ID therefore a 2-entry array */ static struct nla_policy genl_policy_reject_all[] = { { .type = NLA_REJECT }, { .type = NLA_REJECT }, }; static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id); static void genl_op_fill_in_reject_policy(const struct genl_family *family, struct genl_ops *op) { BUILD_BUG_ON(ARRAY_SIZE(genl_policy_reject_all) - 1 != 1); if (op->policy || op->cmd < family->resv_start_op) return; op->policy = genl_policy_reject_all; op->maxattr = 1; } static void genl_op_fill_in_reject_policy_split(const struct genl_family *family, struct genl_split_ops *op) { if (op->policy) return; op->policy = genl_policy_reject_all; op->maxattr = 1; } static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); } static const struct genl_family *genl_family_find_byname(char *name) { const struct genl_family *family; unsigned int id; idr_for_each_entry(&genl_fam_idr, family, id) if (strcmp(family->name, name) == 0) return family; return NULL; } struct genl_op_iter { const struct genl_family *family; struct genl_split_ops doit; struct genl_split_ops dumpit; int cmd_idx; int entry_idx; u32 cmd; u8 flags; }; static void genl_op_from_full(const struct genl_family *family, unsigned int i, struct genl_ops *op) { *op = family->ops[i]; if (!op->maxattr) op->maxattr = family->maxattr; if (!op->policy) op->policy = family->policy; genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_ops; i++) if (family->ops[i].cmd == cmd) { genl_op_from_full(family, i, op); return 0; } return -ENOENT; } static void genl_op_from_small(const struct genl_family *family, unsigned int i, struct genl_ops *op) { memset(op, 0, sizeof(*op)); op->doit = family->small_ops[i].doit; op->dumpit = family->small_ops[i].dumpit; op->cmd = family->small_ops[i].cmd; op->internal_flags = family->small_ops[i].internal_flags; op->flags = family->small_ops[i].flags; op->validate = family->small_ops[i].validate; op->maxattr = family->maxattr; op->policy = family->policy; genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; for (i = 0; i < family->n_small_ops; i++) if (family->small_ops[i].cmd == cmd) { genl_op_from_small(family, i, op); return 0; } return -ENOENT; } static void genl_op_from_split(struct 
genl_op_iter *iter) { const struct genl_family *family = iter->family; int i, cnt = 0; i = iter->entry_idx - family->n_ops - family->n_small_ops; if (family->split_ops[i + cnt].flags & GENL_CMD_CAP_DO) { iter->doit = family->split_ops[i + cnt]; genl_op_fill_in_reject_policy_split(family, &iter->doit); cnt++; } else { memset(&iter->doit, 0, sizeof(iter->doit)); } if (i + cnt < family->n_split_ops && family->split_ops[i + cnt].flags & GENL_CMD_CAP_DUMP && (!cnt || family->split_ops[i + cnt].cmd == iter->doit.cmd)) { iter->dumpit = family->split_ops[i + cnt]; genl_op_fill_in_reject_policy_split(family, &iter->dumpit); cnt++; } else { memset(&iter->dumpit, 0, sizeof(iter->dumpit)); } WARN_ON(!cnt); iter->entry_idx += cnt; } static int genl_get_cmd_split(u32 cmd, u8 flag, const struct genl_family *family, struct genl_split_ops *op) { int i; for (i = 0; i < family->n_split_ops; i++) if (family->split_ops[i].cmd == cmd && family->split_ops[i].flags & flag) { *op = family->split_ops[i]; return 0; } return -ENOENT; } static int genl_cmd_full_to_split(struct genl_split_ops *op, const struct genl_family *family, const struct genl_ops *full, u8 flags) { if ((flags & GENL_CMD_CAP_DO && !full->doit) || (flags & GENL_CMD_CAP_DUMP && !full->dumpit)) { memset(op, 0, sizeof(*op)); return -ENOENT; } if (flags & GENL_CMD_CAP_DUMP) { op->start = full->start; op->dumpit = full->dumpit; op->done = full->done; } else { op->pre_doit = family->pre_doit; op->doit = full->doit; op->post_doit = family->post_doit; } if (flags & GENL_CMD_CAP_DUMP && full->validate & GENL_DONT_VALIDATE_DUMP) { op->policy = NULL; op->maxattr = 0; } else { op->policy = full->policy; op->maxattr = full->maxattr; } op->cmd = full->cmd; op->internal_flags = full->internal_flags; op->flags = full->flags; op->validate = full->validate; /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */ op->flags |= flags; return 0; } /* Must make sure that op is initialized to 0 on failure */ static int genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, struct genl_split_ops *op) { struct genl_ops full; int err; err = genl_get_cmd_full(cmd, family, &full); if (err == -ENOENT) err = genl_get_cmd_small(cmd, family, &full); /* Found one of legacy forms */ if (err == 0) return genl_cmd_full_to_split(op, family, &full, flags); err = genl_get_cmd_split(cmd, flags, family, op); if (err) memset(op, 0, sizeof(*op)); return err; } /* For policy dumping only, get ops of both do and dump. * Fail if both are missing, genl_get_cmd() will zero-init in case of failure. */ static int genl_get_cmd_both(u32 cmd, const struct genl_family *family, struct genl_split_ops *doit, struct genl_split_ops *dumpit) { int err1, err2; err1 = genl_get_cmd(cmd, GENL_CMD_CAP_DO, family, doit); err2 = genl_get_cmd(cmd, GENL_CMD_CAP_DUMP, family, dumpit); return err1 && err2 ? 
-ENOENT : 0; } static bool genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) { iter->family = family; iter->cmd_idx = 0; iter->entry_idx = 0; iter->flags = 0; return iter->family->n_ops + iter->family->n_small_ops + iter->family->n_split_ops; } static bool genl_op_iter_next(struct genl_op_iter *iter) { const struct genl_family *family = iter->family; bool legacy_op = true; struct genl_ops op; if (iter->entry_idx < family->n_ops) { genl_op_from_full(family, iter->entry_idx, &op); } else if (iter->entry_idx < family->n_ops + family->n_small_ops) { genl_op_from_small(family, iter->entry_idx - family->n_ops, &op); } else if (iter->entry_idx < family->n_ops + family->n_small_ops + family->n_split_ops) { legacy_op = false; /* updates entry_idx */ genl_op_from_split(iter); } else { return false; } iter->cmd_idx++; if (legacy_op) { iter->entry_idx++; genl_cmd_full_to_split(&iter->doit, family, &op, GENL_CMD_CAP_DO); genl_cmd_full_to_split(&iter->dumpit, family, &op, GENL_CMD_CAP_DUMP); } iter->cmd = iter->doit.cmd | iter->dumpit.cmd; iter->flags = iter->doit.flags | iter->dumpit.flags; return true; } static void genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src) { *dst = *src; } static unsigned int genl_op_iter_idx(struct genl_op_iter *iter) { return iter->cmd_idx; } static int genl_allocate_reserve_groups(int n_groups, int *first_id) { unsigned long *new_groups; int start = 0; int i; int id; bool fits; do { if (start == 0) id = find_first_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG); else id = find_next_zero_bit(mc_groups, mc_groups_longs * BITS_PER_LONG, start); fits = true; for (i = id; i < min_t(int, id + n_groups, mc_groups_longs * BITS_PER_LONG); i++) { if (test_bit(i, mc_groups)) { start = i; fits = false; break; } } if (id + n_groups > mc_groups_longs * BITS_PER_LONG) { unsigned long new_longs = mc_groups_longs + BITS_TO_LONGS(n_groups); size_t nlen = new_longs * sizeof(unsigned long); if (mc_groups == &mc_group_start) { new_groups = kzalloc(nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; *mc_groups = mc_group_start; } else { new_groups = krealloc(mc_groups, nlen, GFP_KERNEL); if (!new_groups) return -ENOMEM; mc_groups = new_groups; for (i = 0; i < BITS_TO_LONGS(n_groups); i++) mc_groups[mc_groups_longs + i] = 0; } mc_groups_longs = new_longs; } } while (!fits); for (i = id; i < id + n_groups; i++) set_bit(i, mc_groups); *first_id = id; return 0; } static struct genl_family genl_ctrl; static int genl_validate_assign_mc_groups(struct genl_family *family) { int first_id; int n_groups = family->n_mcgrps; int err = 0, i; bool groups_allocated = false; if (!n_groups) return 0; for (i = 0; i < n_groups; i++) { const struct genl_multicast_group *grp = &family->mcgrps[i]; if (WARN_ON(grp->name[0] == '\0')) return -EINVAL; if (WARN_ON(!string_is_terminated(grp->name, GENL_NAMSIZ))) return -EINVAL; } /* special-case our own group and hacks */ if (family == &genl_ctrl) { first_id = GENL_ID_CTRL; BUG_ON(n_groups != 1); } else if (strcmp(family->name, "NET_DM") == 0) { first_id = 1; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_VFS_DQUOT) { first_id = GENL_ID_VFS_DQUOT; BUG_ON(n_groups != 1); } else if (family->id == GENL_ID_PMCRAID) { first_id = GENL_ID_PMCRAID; BUG_ON(n_groups != 1); } else { groups_allocated = true; err = genl_allocate_reserve_groups(n_groups, &first_id); if (err) return err; } family->mcgrp_offset = first_id; /* if still initializing, can't and don't need to realloc bitmaps */ if 
(!init_net.genl_sock) return 0; if (family->netnsok) { struct net *net; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { err = __netlink_change_ngroups(net->genl_sock, mc_groups_longs * BITS_PER_LONG); if (err) { /* * No need to roll back, can only fail if * memory allocation fails and then the * number of _possible_ groups has been * increased on some sockets which is ok. */ break; } } rcu_read_unlock(); netlink_table_ungrab(); } else { err = netlink_change_ngroups(init_net.genl_sock, mc_groups_longs * BITS_PER_LONG); } if (groups_allocated && err) { for (i = 0; i < family->n_mcgrps; i++) clear_bit(family->mcgrp_offset + i, mc_groups); } return err; } static void genl_unregister_mc_groups(const struct genl_family *family) { struct net *net; int i; netlink_table_grab(); rcu_read_lock(); for_each_net_rcu(net) { for (i = 0; i < family->n_mcgrps; i++) __netlink_clear_multicast_users( net->genl_sock, family->mcgrp_offset + i); } rcu_read_unlock(); netlink_table_ungrab(); for (i = 0; i < family->n_mcgrps; i++) { int grp_id = family->mcgrp_offset + i; if (grp_id != 1) clear_bit(grp_id, mc_groups); genl_ctrl_event(CTRL_CMD_DELMCAST_GRP, family, &family->mcgrps[i], grp_id); } } static bool genl_split_op_check(const struct genl_split_ops *op) { if (WARN_ON(hweight8(op->flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP)) != 1)) return true; return false; } static int genl_validate_ops(const struct genl_family *family) { struct genl_op_iter i, j; unsigned int s; if (WARN_ON(family->n_ops && !family->ops) || WARN_ON(family->n_small_ops && !family->small_ops) || WARN_ON(family->n_split_ops && !family->split_ops)) return -EINVAL; for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) { if (!(i.flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP))) return -EINVAL; if (WARN_ON(i.cmd >= family->resv_start_op && (i.doit.validate || i.dumpit.validate))) return -EINVAL; genl_op_iter_copy(&j, &i); while (genl_op_iter_next(&j)) { if (i.cmd == j.cmd) return -EINVAL; } } if (family->n_split_ops) { if (genl_split_op_check(&family->split_ops[0])) return -EINVAL; } for (s = 1; s < family->n_split_ops; s++) { const struct genl_split_ops *a, *b; a = &family->split_ops[s - 1]; b = &family->split_ops[s]; if (genl_split_op_check(b)) return -EINVAL; /* Check sort order */ if (a->cmd < b->cmd) { continue; } else if (a->cmd > b->cmd) { WARN_ON(1); return -EINVAL; } if (a->internal_flags != b->internal_flags || ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP))) { WARN_ON(1); return -EINVAL; } if ((a->flags & GENL_CMD_CAP_DO) && (b->flags & GENL_CMD_CAP_DUMP)) continue; WARN_ON(1); return -EINVAL; } return 0; } static void *genl_sk_priv_alloc(struct genl_family *family) { void *priv; priv = kzalloc(family->sock_priv_size, GFP_KERNEL); if (!priv) return ERR_PTR(-ENOMEM); if (family->sock_priv_init) family->sock_priv_init(priv); return priv; } static void genl_sk_priv_free(const struct genl_family *family, void *priv) { if (family->sock_priv_destroy) family->sock_priv_destroy(priv); kfree(priv); } static int genl_sk_privs_alloc(struct genl_family *family) { if (!family->sock_priv_size) return 0; family->sock_privs = kzalloc(sizeof(*family->sock_privs), GFP_KERNEL); if (!family->sock_privs) return -ENOMEM; xa_init(family->sock_privs); return 0; } static void genl_sk_privs_free(const struct genl_family *family) { unsigned long id; void *priv; if (!family->sock_priv_size) return; xa_for_each(family->sock_privs, id, priv) genl_sk_priv_free(family, priv); xa_destroy(family->sock_privs); 
kfree(family->sock_privs); } static void genl_sk_priv_free_by_sock(struct genl_family *family, struct sock *sk) { void *priv; if (!family->sock_priv_size) return; priv = xa_erase(family->sock_privs, (unsigned long) sk); if (!priv) return; genl_sk_priv_free(family, priv); } static void genl_release(struct sock *sk, unsigned long *groups) { struct genl_family *family; unsigned int id; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) genl_sk_priv_free_by_sock(family, sk); up_read(&cb_lock); } /** * __genl_sk_priv_get - Get family private pointer for socket, if exists * * @family: family * @sk: socket * * Lookup a private memory for a Generic netlink family and specified socket. * * Caller should make sure this is called in RCU read locked section. * * Return: valid pointer on success, otherwise negative error value * encoded by ERR_PTR(), NULL in case priv does not exist. */ void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk) { if (WARN_ON_ONCE(!family->sock_privs)) return ERR_PTR(-EINVAL); return xa_load(family->sock_privs, (unsigned long) sk); } /** * genl_sk_priv_get - Get family private pointer for socket * * @family: family * @sk: socket * * Lookup a private memory for a Generic netlink family and specified socket. * Allocate the private memory in case it was not already done. * * Return: valid pointer on success, otherwise negative error value * encoded by ERR_PTR(). */ void *genl_sk_priv_get(struct genl_family *family, struct sock *sk) { void *priv, *old_priv; priv = __genl_sk_priv_get(family, sk); if (priv) return priv; /* priv for the family does not exist so far, create it. */ priv = genl_sk_priv_alloc(family); if (IS_ERR(priv)) return ERR_CAST(priv); old_priv = xa_cmpxchg(family->sock_privs, (unsigned long) sk, NULL, priv, GFP_KERNEL); if (old_priv) { genl_sk_priv_free(family, priv); if (xa_is_err(old_priv)) return ERR_PTR(xa_err(old_priv)); /* Race happened, priv for the socket was already inserted. */ return old_priv; } return priv; } /** * genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. * * The family's ops, multicast groups and module pointer must already * be assigned. * * Return 0 on success or a negative error code. */ int genl_register_family(struct genl_family *family) { int err, i; int start = GENL_START_ALLOC, end = GENL_MAX_ID; err = genl_validate_ops(family); if (err) return err; genl_lock_all(); if (genl_family_find_byname(family->name)) { err = -EEXIST; goto errout_locked; } err = genl_sk_privs_alloc(family); if (err) goto errout_locked; /* * Sadly, a few cases need to be special-cased * due to them having previously abused the API * and having used their family ID also as their * multicast group ID, so we use reserved IDs * for both to be sure we can do that mapping. 
*/ if (family == &genl_ctrl) { /* and this needs to be special for initial family lookups */ start = end = GENL_ID_CTRL; } else if (strcmp(family->name, "pmcraid") == 0) { start = end = GENL_ID_PMCRAID; } else if (strcmp(family->name, "VFS_DQUOT") == 0) { start = end = GENL_ID_VFS_DQUOT; } family->id = idr_alloc_cyclic(&genl_fam_idr, family, start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; goto errout_sk_privs_free; } err = genl_validate_assign_mc_groups(family); if (err) goto errout_remove; genl_unlock_all(); /* send all events */ genl_ctrl_event(CTRL_CMD_NEWFAMILY, family, NULL, 0); for (i = 0; i < family->n_mcgrps; i++) genl_ctrl_event(CTRL_CMD_NEWMCAST_GRP, family, &family->mcgrps[i], family->mcgrp_offset + i); return 0; errout_remove: idr_remove(&genl_fam_idr, family->id); errout_sk_privs_free: genl_sk_privs_free(family); errout_locked: genl_unlock_all(); return err; } EXPORT_SYMBOL(genl_register_family); /** * genl_unregister_family - unregister generic netlink family * @family: generic netlink family * * Unregisters the specified family. * * Returns 0 on success or a negative error code. */ int genl_unregister_family(const struct genl_family *family) { genl_lock_all(); if (!genl_family_find_byid(family->id)) { genl_unlock_all(); return -ENOENT; } genl_unregister_mc_groups(family); idr_remove(&genl_fam_idr, family->id); up_write(&cb_lock); wait_event(genl_sk_destructing_waitq, atomic_read(&genl_sk_destructing_cnt) == 0); genl_sk_privs_free(family); genl_unlock(); genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); return 0; } EXPORT_SYMBOL(genl_unregister_family); /** * genlmsg_put - Add generic netlink header to netlink message * @skb: socket buffer holding the message * @portid: netlink portid the message is addressed to * @seq: sequence number (usually the one of the sender) * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header */ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd) { struct nlmsghdr *nlh; struct genlmsghdr *hdr; nlh = nlmsg_put(skb, portid, seq, family->id, GENL_HDRLEN + family->hdrsize, flags); if (nlh == NULL) return NULL; hdr = nlmsg_data(nlh); hdr->cmd = cmd; hdr->version = family->version; hdr->reserved = 0; return (char *) hdr + GENL_HDRLEN; } EXPORT_SYMBOL(genlmsg_put); static struct genl_dumpit_info *genl_dumpit_info_alloc(void) { return kmalloc(sizeof(struct genl_dumpit_info), GFP_KERNEL); } static void genl_dumpit_info_free(const struct genl_dumpit_info *info) { kfree(info); } static struct nlattr ** genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, enum genl_validate_flags no_strict_flag) { enum netlink_validation validate = ops->validate & no_strict_flag ? 
NL_VALIDATE_LIBERAL : NL_VALIDATE_STRICT; struct nlattr **attrbuf; int err; if (!ops->maxattr) return NULL; attrbuf = kmalloc_array(ops->maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) return ERR_PTR(-ENOMEM); err = __nlmsg_parse(nlh, hdrlen, attrbuf, ops->maxattr, ops->policy, validate, extack); if (err) { kfree(attrbuf); return ERR_PTR(err); } return attrbuf; } static void genl_family_rcv_msg_attrs_free(struct nlattr **attrbuf) { kfree(attrbuf); } struct genl_start_context { const struct genl_family *family; struct nlmsghdr *nlh; struct netlink_ext_ack *extack; const struct genl_split_ops *ops; int hdrlen; }; static int genl_start(struct netlink_callback *cb) { struct genl_start_context *ctx = cb->data; const struct genl_split_ops *ops; struct genl_dumpit_info *info; struct nlattr **attrs = NULL; int rc = 0; ops = ctx->ops; if (!(ops->validate & GENL_DONT_VALIDATE_DUMP) && ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) return -EINVAL; attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, ops, ctx->hdrlen, GENL_DONT_VALIDATE_DUMP_STRICT); if (IS_ERR(attrs)) return PTR_ERR(attrs); info = genl_dumpit_info_alloc(); if (!info) { genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } info->op = *ops; info->info.family = ctx->family; info->info.snd_seq = cb->nlh->nlmsg_seq; info->info.snd_portid = NETLINK_CB(cb->skb).portid; info->info.nlhdr = cb->nlh; info->info.genlhdr = nlmsg_data(cb->nlh); info->info.attrs = attrs; genl_info_net_set(&info->info, sock_net(cb->skb->sk)); info->info.extack = cb->extack; memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr)); cb->data = info; if (ops->start) { genl_op_lock(ctx->family); rc = ops->start(cb); genl_op_unlock(ctx->family); } if (rc) { genl_family_rcv_msg_attrs_free(info->info.attrs); genl_dumpit_info_free(info); cb->data = NULL; } return rc; } static int genl_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct genl_dumpit_info *dump_info = cb->data; const struct genl_split_ops *ops = &dump_info->op; struct genl_info *info = &dump_info->info; int rc; info->extack = cb->extack; genl_op_lock(info->family); rc = ops->dumpit(skb, cb); genl_op_unlock(info->family); return rc; } static int genl_done(struct netlink_callback *cb) { struct genl_dumpit_info *dump_info = cb->data; const struct genl_split_ops *ops = &dump_info->op; struct genl_info *info = &dump_info->info; int rc = 0; info->extack = cb->extack; if (ops->done) { genl_op_lock(info->family); rc = ops->done(cb); genl_op_unlock(info->family); } genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(dump_info); return rc; } static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct genl_start_context ctx; struct netlink_dump_control c = { .module = family->module, .data = &ctx, .start = genl_start, .dump = genl_dumpit, .done = genl_done, .extack = extack, }; int err; ctx.family = family; ctx.nlh = nlh; ctx.extack = extack; ctx.ops = ops; ctx.hdrlen = hdrlen; genl_op_unlock(family); err = __netlink_dump_start(net->genl_sock, skb, nlh, &c); genl_op_lock(family); return err; } static int genl_family_rcv_msg_doit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct nlattr **attrbuf; struct genl_info info; int err; attrbuf = 
genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen, GENL_DONT_VALIDATE_STRICT); if (IS_ERR(attrbuf)) return PTR_ERR(attrbuf); info.snd_seq = nlh->nlmsg_seq; info.snd_portid = NETLINK_CB(skb).portid; info.family = family; info.nlhdr = nlh; info.genlhdr = nlmsg_data(nlh); info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); if (ops->pre_doit) { err = ops->pre_doit(ops, skb, &info); if (err) goto out; } err = ops->doit(skb, &info); if (ops->post_doit) ops->post_doit(ops, skb, &info); out: genl_family_rcv_msg_attrs_free(attrbuf); return err; } static int genl_header_check(const struct genl_family *family, struct nlmsghdr *nlh, struct genlmsghdr *hdr, struct netlink_ext_ack *extack) { u16 flags; /* Only for commands added after we started validating */ if (hdr->cmd < family->resv_start_op) return 0; if (hdr->reserved) { NL_SET_ERR_MSG(extack, "genlmsghdr.reserved field is not 0"); return -EINVAL; } /* Old netlink flags have pretty loose semantics, allow only the flags * consumed by the core where we can enforce the meaning. */ flags = nlh->nlmsg_flags; if ((flags & NLM_F_DUMP) == NLM_F_DUMP) /* DUMP is 2 bits */ flags &= ~NLM_F_DUMP; if (flags & ~(NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO)) { NL_SET_ERR_MSG(extack, "ambiguous or reserved bits set in nlmsg_flags"); return -EINVAL; } return 0; } static int genl_family_rcv_msg(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct genlmsghdr *hdr = nlmsg_data(nlh); struct genl_split_ops op; int hdrlen; u8 flags; /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) return -ENOENT; hdrlen = GENL_HDRLEN + family->hdrsize; if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; if (genl_header_check(family, nlh, hdr, extack)) return -EINVAL; flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ? 
GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO; if (genl_get_cmd(hdr->cmd, flags, family, &op)) return -EOPNOTSUPP; if ((op.flags & GENL_ADMIN_PERM) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if ((op.flags & GENL_UNS_ADMIN_PERM) && !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (flags & GENL_CMD_CAP_DUMP) return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, &op, hdrlen, net); else return genl_family_rcv_msg_doit(family, skb, nlh, extack, &op, hdrlen, net); } static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { const struct genl_family *family; int err; family = genl_family_find_byid(nlh->nlmsg_type); if (family == NULL) return -ENOENT; genl_op_lock(family); err = genl_family_rcv_msg(family, skb, nlh, extack); genl_op_unlock(family); return err; } static void genl_rcv(struct sk_buff *skb) { down_read(&cb_lock); netlink_rcv_skb(skb, &genl_rcv_msg); up_read(&cb_lock); } /************************************************************************** * Controller **************************************************************************/ static struct genl_family genl_ctrl; static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { struct genl_op_iter i; void *hdr; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id) || nla_put_u32(skb, CTRL_ATTR_VERSION, family->version) || nla_put_u32(skb, CTRL_ATTR_HDRSIZE, family->hdrsize) || nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr)) goto nla_put_failure; if (genl_op_iter_init(family, &i)) { struct nlattr *nla_ops; nla_ops = nla_nest_start_noflag(skb, CTRL_ATTR_OPS); if (nla_ops == NULL) goto nla_put_failure; while (genl_op_iter_next(&i)) { struct nlattr *nest; u32 op_flags; op_flags = i.flags; if (i.doit.policy || i.dumpit.policy) op_flags |= GENL_CMD_CAP_HASPOL; nest = nla_nest_start_noflag(skb, genl_op_iter_idx(&i)); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_OP_ID, i.cmd) || nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_ops); } if (family->n_mcgrps) { struct nlattr *nla_grps; int i; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; for (i = 0; i < family->n_mcgrps; i++) { struct nlattr *nest; const struct genl_multicast_group *grp; grp = &family->mcgrps[i]; nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, family->mcgrp_offset + i) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); } nla_nest_end(skb, nla_grps); } genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_fill_mcgrp_info(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; struct nlattr *nla_grps; struct nlattr *nest; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); if (hdr == NULL) return -1; if (nla_put_string(skb, CTRL_ATTR_FAMILY_NAME, family->name) || nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, family->id)) goto nla_put_failure; nla_grps = nla_nest_start_noflag(skb, CTRL_ATTR_MCAST_GROUPS); if (nla_grps == NULL) goto nla_put_failure; 
nest = nla_nest_start_noflag(skb, 1); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, CTRL_ATTR_MCAST_GRP_ID, grp_id) || nla_put_string(skb, CTRL_ATTR_MCAST_GRP_NAME, grp->name)) goto nla_put_failure; nla_nest_end(skb, nest); nla_nest_end(skb, nla_grps); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) { int n = 0; struct genl_family *rt; struct net *net = sock_net(skb->sk); int fams_to_skip = cb->args[0]; unsigned int id; idr_for_each_entry(&genl_fam_idr, rt, id) { if (!rt->netnsok && !net_eq(net, &init_net)) continue; if (n++ < fams_to_skip) continue; if (ctrl_fill_info(rt, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, CTRL_CMD_NEWFAMILY) < 0) { n--; break; } } cb->args[0] = n; return skb->len; } static struct sk_buff *ctrl_build_family_msg(const struct genl_family *family, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_info(family, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static struct sk_buff * ctrl_build_mcgrp_msg(const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id, u32 portid, int seq, u8 cmd) { struct sk_buff *skb; int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOBUFS); err = ctrl_fill_mcgrp_info(family, grp, grp_id, portid, seq, 0, skb, cmd); if (err < 0) { nlmsg_free(skb); return ERR_PTR(err); } return skb; } static const struct nla_policy ctrl_policy_family[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, }; static int ctrl_getfamily(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; const struct genl_family *res = NULL; int err = -EINVAL; if (info->attrs[CTRL_ATTR_FAMILY_ID]) { u16 id = nla_get_u16(info->attrs[CTRL_ATTR_FAMILY_ID]); res = genl_family_find_byid(id); err = -ENOENT; } if (info->attrs[CTRL_ATTR_FAMILY_NAME]) { char *name; name = nla_data(info->attrs[CTRL_ATTR_FAMILY_NAME]); res = genl_family_find_byname(name); #ifdef CONFIG_MODULES if (res == NULL) { genl_unlock(); up_read(&cb_lock); request_module("net-pf-%d-proto-%d-family-%s", PF_NETLINK, NETLINK_GENERIC, name); down_read(&cb_lock); genl_lock(); res = genl_family_find_byname(name); } #endif err = -ENOENT; } if (res == NULL) return err; if (!res->netnsok && !net_eq(genl_info_net(info), &init_net)) { /* family doesn't exist here */ return -ENOENT; } msg = ctrl_build_family_msg(res, info->snd_portid, info->snd_seq, CTRL_CMD_NEWFAMILY); if (IS_ERR(msg)) return PTR_ERR(msg); return genlmsg_reply(msg, info); } static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id) { struct sk_buff *msg; /* genl is still initialising */ if (!init_net.genl_sock) return 0; switch (event) { case CTRL_CMD_NEWFAMILY: case CTRL_CMD_DELFAMILY: WARN_ON(grp); msg = ctrl_build_family_msg(family, 0, 0, event); break; case CTRL_CMD_NEWMCAST_GRP: case CTRL_CMD_DELMCAST_GRP: BUG_ON(!grp); msg = ctrl_build_mcgrp_msg(family, grp, grp_id, 0, 0, event); break; default: return -EINVAL; } if (IS_ERR(msg)) return PTR_ERR(msg); if (!family->netnsok) { genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0, 0, GFP_KERNEL); } else { rcu_read_lock(); genlmsg_multicast_allns(&genl_ctrl, msg, 0, 
0, GFP_ATOMIC); rcu_read_unlock(); } return 0; } struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; const struct genl_family *rt; struct genl_op_iter *op_iter; u32 op; u16 fam_id; u8 dump_map:1, single_op:1; }; static const struct nla_policy ctrl_policy_policy[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, [CTRL_ATTR_OP] = { .type = NLA_U32 }, }; static int ctrl_dumppolicy_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->info.attrs; const struct genl_family *rt; struct genl_op_iter i; int err; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) return -EINVAL; if (tb[CTRL_ATTR_FAMILY_ID]) { ctx->fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); } else { rt = genl_family_find_byname( nla_data(tb[CTRL_ATTR_FAMILY_NAME])); if (!rt) return -ENOENT; ctx->fam_id = rt->id; } rt = genl_family_find_byid(ctx->fam_id); if (!rt) return -ENOENT; ctx->rt = rt; if (tb[CTRL_ATTR_OP]) { struct genl_split_ops doit, dump; ctx->single_op = true; ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); err = genl_get_cmd_both(ctx->op, rt, &doit, &dump); if (err) { NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); return err; } if (doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, doit.policy, doit.maxattr); if (err) goto err_free_state; } if (dump.policy) { err = netlink_policy_dump_add_policy(&ctx->state, dump.policy, dump.maxattr); if (err) goto err_free_state; } if (!ctx->state) return -ENODATA; ctx->dump_map = 1; return 0; } ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL); if (!ctx->op_iter) return -ENOMEM; genl_op_iter_init(rt, ctx->op_iter); ctx->dump_map = genl_op_iter_next(ctx->op_iter); for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) { if (i.doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, i.doit.policy, i.doit.maxattr); if (err) goto err_free_state; } if (i.dumpit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, i.dumpit.policy, i.dumpit.maxattr); if (err) goto err_free_state; } } if (!ctx->state) { err = -ENODATA; goto err_free_op_iter; } return 0; err_free_state: netlink_policy_dump_free(ctx->state); err_free_op_iter: kfree(ctx->op_iter); return err; } static void *ctrl_dumppolicy_prep(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &genl_ctrl, NLM_F_MULTI, CTRL_CMD_GETPOLICY); if (!hdr) return NULL; if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, ctx->fam_id)) return NULL; return hdr; } static int ctrl_dumppolicy_put_op(struct sk_buff *skb, struct netlink_callback *cb, struct genl_split_ops *doit, struct genl_split_ops *dumpit) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr *nest_pol, *nest_op; void *hdr; int idx; /* skip if we have nothing to show */ if (!doit->policy && !dumpit->policy) return 0; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) return -ENOBUFS; nest_pol = nla_nest_start(skb, CTRL_ATTR_OP_POLICY); if (!nest_pol) goto err; nest_op = nla_nest_start(skb, doit->cmd); if (!nest_op) goto err; if (doit->policy) { idx = netlink_policy_dump_get_policy_idx(ctx->state, doit->policy, doit->maxattr); if (nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) goto err; } if (dumpit->policy) { idx = 
netlink_policy_dump_get_policy_idx(ctx->state, dumpit->policy, dumpit->maxattr); if (nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) goto err; } nla_nest_end(skb, nest_op); nla_nest_end(skb, nest_pol); genlmsg_end(skb, hdr); return 0; err: genlmsg_cancel(skb, hdr); return -ENOBUFS; } static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; if (ctx->dump_map) { if (ctx->single_op) { struct genl_split_ops doit, dumpit; if (WARN_ON(genl_get_cmd_both(ctx->op, ctx->rt, &doit, &dumpit))) return -ENOENT; if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) return skb->len; /* done with the per-op policy index list */ ctx->dump_map = 0; } while (ctx->dump_map) { if (ctrl_dumppolicy_put_op(skb, cb, &ctx->op_iter->doit, &ctx->op_iter->dumpit)) return skb->len; ctx->dump_map = genl_op_iter_next(ctx->op_iter); } } while (netlink_policy_dump_loop(ctx->state)) { struct nlattr *nest; hdr = ctrl_dumppolicy_prep(skb, cb); if (!hdr) goto nla_put_failure; nest = nla_nest_start(skb, CTRL_ATTR_POLICY); if (!nest) goto nla_put_failure; if (netlink_policy_dump_write(skb, ctx->state)) goto nla_put_failure; nla_nest_end(skb, nest); genlmsg_end(skb, hdr); } return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); return skb->len; } static int ctrl_dumppolicy_done(struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; kfree(ctx->op_iter); netlink_policy_dump_free(ctx->state); return 0; } static const struct genl_split_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, .validate = GENL_DONT_VALIDATE_STRICT, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .doit = ctrl_getfamily, .flags = GENL_CMD_CAP_DO, }, { .cmd = CTRL_CMD_GETFAMILY, .validate = GENL_DONT_VALIDATE_DUMP, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .dumpit = ctrl_dumpfamily, .flags = GENL_CMD_CAP_DUMP, }, { .cmd = CTRL_CMD_GETPOLICY, .policy = ctrl_policy_policy, .maxattr = ARRAY_SIZE(ctrl_policy_policy) - 1, .start = ctrl_dumppolicy_start, .dumpit = ctrl_dumppolicy, .done = ctrl_dumppolicy_done, .flags = GENL_CMD_CAP_DUMP, }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; static struct genl_family genl_ctrl __ro_after_init = { .module = THIS_MODULE, .split_ops = genl_ctrl_ops, .n_split_ops = ARRAY_SIZE(genl_ctrl_ops), .resv_start_op = CTRL_CMD_GETPOLICY + 1, .mcgrps = genl_ctrl_groups, .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups), .id = GENL_ID_CTRL, .name = "nlctrl", .version = 0x2, .netnsok = true, }; static int genl_bind(struct net *net, int group) { const struct genl_family *family; unsigned int id; int ret = 0; down_read(&cb_lock); idr_for_each_entry(&genl_fam_idr, family, id) { const struct genl_multicast_group *grp; int i; if (family->n_mcgrps == 0) continue; i = group - family->mcgrp_offset; if (i < 0 || i >= family->n_mcgrps) continue; grp = &family->mcgrps[i]; if ((grp->flags & GENL_MCAST_CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) ret = -EPERM; if ((grp->flags & GENL_MCAST_CAP_SYS_ADMIN) && !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; break; } up_read(&cb_lock); return ret; } static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, .bind = genl_bind, .release = genl_release, }; /* we'll bump the group number right afterwards */ net->genl_sock = netlink_kernel_create(net, NETLINK_GENERIC, &cfg); if 
(!net->genl_sock && net_eq(net, &init_net)) panic("GENL: Cannot initialize generic netlink\n"); if (!net->genl_sock) return -ENOMEM; return 0; } static void __net_exit genl_pernet_exit(struct net *net) { netlink_kernel_release(net->genl_sock); net->genl_sock = NULL; } static struct pernet_operations genl_pernet_ops = { .init = genl_pernet_init, .exit = genl_pernet_exit, }; static int __init genl_init(void) { int err; err = genl_register_family(&genl_ctrl); if (err < 0) goto problem; err = register_pernet_subsys(&genl_pernet_ops); if (err) goto problem; return 0; problem: panic("GENL: Cannot register controller: %d\n", err); } core_initcall(genl_init); static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, gfp_t flags) { struct sk_buff *tmp; struct net *net, *prev = NULL; bool delivered = false; int err; for_each_net_rcu(net) { if (prev) { tmp = skb_clone(skb, flags); if (!tmp) { err = -ENOMEM; goto error; } err = nlmsg_multicast(prev->genl_sock, tmp, portid, group, flags); if (!err) delivered = true; else if (err != -ESRCH) goto error; } prev = net; } err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); if (!err) delivered = true; else if (err != -ESRCH) return err; return delivered ? 0 : -ESRCH; error: kfree_skb(skb); return err; } int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group, flags); } EXPORT_SYMBOL(genlmsg_multicast_allns); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags) { struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, nlmsg_report(info->nlhdr), flags); } EXPORT_SYMBOL(genl_notify);
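/*
 * Illustrative usage sketch (not part of the genetlink core above): a
 * minimal, hypothetical "example_fam" generic netlink family showing how
 * the interfaces documented above fit together. genl_register_family()
 * validates the ops and assigns the family id and multicast group offset,
 * a doit handler builds its reply around genlmsg_put(), and
 * genl_unregister_family() tears everything down again. All example_*
 * identifiers and the command/attribute numbers below are made up for this
 * sketch; they are not defined anywhere in the kernel.
 */
#include <linux/module.h>
#include <net/genetlink.h>

enum {
	EXAMPLE_ATTR_UNSPEC,
	EXAMPLE_ATTR_VALUE,	/* u32 payload carried in the GET reply */
	__EXAMPLE_ATTR_MAX,
};
#define EXAMPLE_ATTR_MAX (__EXAMPLE_ATTR_MAX - 1)

enum {
	EXAMPLE_CMD_UNSPEC,
	EXAMPLE_CMD_GET,
	__EXAMPLE_CMD_MAX,
};

static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_VALUE] = { .type = NLA_U32 },
};

static struct genl_family example_family;

/* .doit handler: unicast a single EXAMPLE_ATTR_VALUE back to the requester */
static int example_get_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *hdr;

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
			  &example_family, 0, EXAMPLE_CMD_GET);
	if (!hdr || nla_put_u32(msg, EXAMPLE_ATTR_VALUE, 42)) {
		nlmsg_free(msg);
		return -EMSGSIZE;
	}

	genlmsg_end(msg, hdr);
	return genlmsg_reply(msg, info);
}

static const struct genl_ops example_ops[] = {
	{
		.cmd		= EXAMPLE_CMD_GET,
		.doit		= example_get_doit,
		.policy		= example_policy,
		.maxattr	= EXAMPLE_ATTR_MAX,
	},
};

static const struct genl_multicast_group example_mcgrps[] = {
	{ .name = "events", },
};

static struct genl_family example_family __ro_after_init = {
	.name		= "example_fam",
	.version	= 1,
	.module		= THIS_MODULE,
	.ops		= example_ops,
	.n_ops		= ARRAY_SIZE(example_ops),
	.resv_start_op	= __EXAMPLE_CMD_MAX,
	.mcgrps		= example_mcgrps,
	.n_mcgrps	= ARRAY_SIZE(example_mcgrps),
};

static int __init example_genl_init(void)
{
	/* fails with -EEXIST if a family of the same name already exists */
	return genl_register_family(&example_family);
}
module_init(example_genl_init);

static void __exit example_genl_exit(void)
{
	genl_unregister_family(&example_family);
}
module_exit(example_genl_exit);

MODULE_DESCRIPTION("Hypothetical genetlink usage sketch");
MODULE_LICENSE("GPL");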
/*
 * INETPEER - A storage for permanent information about peers
 *
 * This source is covered by the GNU GPL, the same as all kernel sources.
 *
 * Authors: Andrey V. Savochkin <saw@msu.ru>
 */
#include <linux/cache.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
#include <net/secure_seq.h>

/*
 * Theory of operations.
 * We keep one entry for each peer IP address. Each node contains long-living
 * information about the peer which doesn't depend on routes.
 *
 * Nodes are removed only when their reference counter goes to 0.
 * Once that happens, a node may be removed once a sufficient amount of
 * time has passed since its last use. The least-recently-used entries can
 * also be removed if the pool is overloaded, i.e. if the total number of
 * entries is greater than or equal to the threshold.
 *
 * The node pool is organised as an RB tree.
 * Such an implementation has been chosen not just for fun. It is a way to
 * prevent easy and efficient DoS attacks based on creating hash collisions.
 * A huge number of long-living nodes in a single hash slot would
 * significantly delay lookups performed with BHs disabled.
 *
 * Serialisation issues.
 * 1. Nodes may appear in the tree only with the pool lock held.
 * 2. Nodes may disappear from the tree only with the pool lock held
 *    AND the reference count being 0.
 * 3. The global variable peer_total is modified under the pool lock.
 * 4. struct inet_peer fields modification:
 *    rb_node: pool lock
 *    refcnt: atomically against modifications on other CPUs;
 *            usually under some other lock to prevent the node disappearing
 *    daddr: unchangeable
 */

static struct kmem_cache *peer_cachep __ro_after_init;

void inet_peer_base_init(struct inet_peer_base *bp)
{
	bp->rb_root = RB_ROOT;
	seqlock_init(&bp->lock);
	bp->total = 0;
}
EXPORT_SYMBOL_GPL(inet_peer_base_init);

#define PEER_MAX_GC 32

/* Exported for sysctl_net_ipv4.
*/ int inet_peer_threshold __read_mostly; /* start to throw entries more * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { u64 nr_entries; /* 1% of physical memory */ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); peer_cachep = kmem_cache_create("inet_peer_cache", sizeof(struct inet_peer), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } /* Called with rcu_read_lock() or base->lock held */ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, struct inet_peer_base *base, unsigned int seq, struct inet_peer *gc_stack[], unsigned int *gc_cnt, struct rb_node **parent_p, struct rb_node ***pp_p) { struct rb_node **pp, *parent, *next; struct inet_peer *p; pp = &base->rb_root.rb_node; parent = NULL; while (1) { int cmp; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { if (!refcount_inc_not_zero(&p->refcnt)) break; return p; } if (gc_stack) { if (*gc_cnt < PEER_MAX_GC) gc_stack[(*gc_cnt)++] = p; } else if (unlikely(read_seqretry(&base->lock, seq))) { break; } if (cmp == -1) pp = &next->rb_left; else pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; return NULL; } static void inetpeer_free_rcu(struct rcu_head *head) { kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } /* perform garbage collect on all items stacked during a lookup */ static void inet_peer_gc(struct inet_peer_base *base, struct inet_peer *gc_stack[], unsigned int gc_cnt) { int peer_threshold, peer_maxttl, peer_minttl; struct inet_peer *p; __u32 delta, ttl; int i; peer_threshold = READ_ONCE(inet_peer_threshold); peer_maxttl = READ_ONCE(inet_peer_maxttl); peer_minttl = READ_ONCE(inet_peer_minttl); if (base->total >= peer_threshold) ttl = 0; /* be aggressive */ else ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * base->total / peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; /* The READ_ONCE() pairs with the WRITE_ONCE() * in inet_putpeer() */ delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; if (p) { rb_erase(&p->rb_node, &base->rb_root); base->total--; call_rcu(&p->rcu, inetpeer_free_rcu); } } } struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock(); seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); invalidated = read_seqretry(&base->lock, seq); rcu_read_unlock(); if (p) return p; /* If no writer did a change during our lookup, we can return early. */ if (!create && !invalidated) return NULL; /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. 
*/ parent = NULL; write_seqlock_bh(&base->lock); gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); if (!p && create) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; refcount_set(&p->refcnt, 2); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first * calculation of tokens is at its maximum. */ p->rate_last = jiffies - 60*HZ; rb_link_node(&p->rb_node, parent, pp); rb_insert_color(&p->rb_node, &base->rb_root); base->total++; } } if (gc_cnt) inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; } EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { /* The WRITE_ONCE() pairs with itself (we run lockless) * and the READ_ONCE() in inet_peer_gc() */ WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) call_rcu(&p->rcu, inetpeer_free_rcu); } EXPORT_SYMBOL_GPL(inet_putpeer); /* * Check transmit rate limitation for given message. * The rate information is held in the inet_peer entries now. * This function is generic and could be used for other purposes * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * Note that the same inet_peer fields are modified by functions in * route.c too, but these work for packet destinations while xrlim_allow * works for icmp destinations. This means the rate limiting information * for one "ip object" is shared - and these ICMPs are twice limited: * by source and by destination. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate * SHOULD allow setting of rate limits * * Shared between ICMPv4 and ICMPv6. */ #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { unsigned long now, token; bool rc = false; if (!peer) return true; token = peer->rate_tokens; now = jiffies; token += now - peer->rate_last; peer->rate_last = now; if (token > XRLIM_BURST_FACTOR * timeout) token = XRLIM_BURST_FACTOR * timeout; if (token >= timeout) { token -= timeout; rc = true; } peer->rate_tokens = token; return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { struct rb_node *p = rb_first(&base->rb_root); while (p) { struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); p = rb_next(p); rb_erase(&peer->rb_node, &base->rb_root); inet_putpeer(peer); cond_resched(); } base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree);
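/*
 * Illustrative usage sketch (not part of inetpeer.c above): how a caller
 * might combine the lookup, rate-limit and release primitives, roughly the
 * way ICMP rate limiting uses this pool. example_peer_allowed() is a
 * hypothetical function; it assumes the inet_getpeer_v4() wrapper from
 * <net/inetpeer.h>, and in real code the base would typically be the
 * per-netns net->ipv4.peers.
 */
#include <net/inetpeer.h>

static bool example_peer_allowed(struct inet_peer_base *base, __be32 daddr,
				 int timeout)
{
	struct inet_peer *peer;
	bool allowed;

	/* create == 1: allocate a node if this destination has none yet */
	peer = inet_getpeer_v4(base, daddr, 0, 1);

	/* a NULL peer means "no state available, do not limit" */
	allowed = inet_peer_xrlim_allow(peer, timeout);

	/* drop the reference taken by the lookup */
	if (peer)
		inet_putpeer(peer);

	return allowed;
}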
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_PAGE_IDLE_H
#define _LINUX_MM_PAGE_IDLE_H

#include <linux/bitops.h>
#include <linux/page-flags.h>
#include <linux/page_ext.h>

#ifdef CONFIG_PAGE_IDLE_FLAG

#ifndef CONFIG_64BIT
/*
 * If there is not enough space to store Idle and Young bits in page flags, use
 * page ext flags instead.
 */
static inline bool folio_test_young(struct folio *folio)
{
	struct page_ext *page_ext = page_ext_get(&folio->page);
	bool page_young;

	if (unlikely(!page_ext))
		return false;

	page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
	page_ext_put(page_ext);

	return page_young;
}

static inline void folio_set_young(struct folio *folio)
{
	struct page_ext *page_ext = page_ext_get(&folio->page);

	if (unlikely(!page_ext))
		return;

	set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
	page_ext_put(page_ext);
}

static inline bool folio_test_clear_young(struct folio *folio)
{
	struct page_ext *page_ext = page_ext_get(&folio->page);
	bool page_young;

	if (unlikely(!page_ext))
		return false;

	page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
	page_ext_put(page_ext);

	return page_young;
}

static inline bool folio_test_idle(struct folio *folio)
{
	struct page_ext *page_ext = page_ext_get(&folio->page);
	bool page_idle;

	if (unlikely(!page_ext))
		return false;

	page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
	page_ext_put(page_ext);

	return page_idle;
}

static inline void folio_set_idle(struct folio *folio)
{
	struct page_ext *page_ext = page_ext_get(&folio->page);

	if (unlikely(!page_ext))
		return;

	set_bit(PAGE_EXT_IDLE, &page_ext->flags);
	page_ext_put(page_ext);
}

static inline void folio_clear_idle(struct folio *folio)
{
	struct page_ext *page_ext = page_ext_get(&folio->page);

	if (unlikely(!page_ext))
		return;

	clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
	page_ext_put(page_ext);
}
#endif /* !CONFIG_64BIT */

#else /* !CONFIG_PAGE_IDLE_FLAG */

static inline bool folio_test_young(struct folio *folio)
{
	return false;
}

static inline void folio_set_young(struct folio *folio)
{
}

static inline bool folio_test_clear_young(struct folio *folio)
{
	return false;
}

static inline bool folio_test_idle(struct folio *folio)
{
	return false;
}

static inline void folio_set_idle(struct folio *folio)
{
}

static inline void folio_clear_idle(struct folio *folio)
{
}

#endif /* CONFIG_PAGE_IDLE_FLAG */

static inline bool page_is_young(struct page *page)
{
	return folio_test_young(page_folio(page));
}

static inline void set_page_young(struct page *page)
{
	folio_set_young(page_folio(page));
}

static inline bool test_and_clear_page_young(struct page *page)
{
	return folio_test_clear_young(page_folio(page));
}

static inline bool page_is_idle(struct page *page)
{
	return folio_test_idle(page_folio(page));
}

static inline void set_page_idle(struct page *page)
{
	folio_set_idle(page_folio(page));
}
#endif /* _LINUX_MM_PAGE_IDLE_H */
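/*
 * Illustrative usage sketch (not part of page_idle.h above): the basic
 * idle-tracking pattern -- mark a folio idle, let the workload run, then
 * treat a set young bit or a cleared idle bit as evidence that the folio
 * was accessed in between. Both example_* helpers are hypothetical; real
 * users such as /sys/kernel/mm/page_idle and DAMON additionally clear and
 * re-harvest the PTE accessed bits via rmap, which is omitted here.
 */
#include <linux/mm.h>
#include <linux/page_idle.h>

static void example_mark_idle(struct folio *folio)
{
	/* discard any access recorded before this sampling interval */
	folio_test_clear_young(folio);
	/* flag the folio idle until something touches it again */
	folio_set_idle(folio);
}

static bool example_was_accessed(struct folio *folio)
{
	/*
	 * An access since example_mark_idle() either set the young bit
	 * (accessed bit harvested during aging) or caused the idle bit
	 * to be cleared.
	 */
	return folio_test_young(folio) || !folio_test_idle(folio);
}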
// SPDX-License-Identifier: GPL-2.0
/*
 * consolemap.c
 *
 * Mapping from internal code (such as Latin-1 or Unicode or IBM PC code)
 * to font positions.
 *
 * aeb, 950210
 *
 * Support for multiple unimaps by Jakub Jelinek <jj@ultra.linux.cz>, July 1998
 *
 * Fix bug in inverse translation. Stanislav Voronyi <stas@cnti.uanet.kharkov.ua>, Dec 1998
 *
 * In order to prevent the following circular lock dependency:
 * &mm->mmap_lock --> cpu_hotplug.lock --> console_lock --> &mm->mmap_lock
 *
 * We cannot allow a page fault to happen while holding the console_lock.
 * Therefore, all the userspace copy operations have to be done outside
 * the console_lock critical sections.
 *
 * As all the affected functions are called directly from vt_ioctl(), we
 * can allocate some small buffers directly on the stack without worrying
 * about stack overflow.
 */

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/module.h>
#include <linux/kd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/uaccess.h>
#include <linux/console.h>
#include <linux/consolemap.h>
#include <linux/vt_kern.h>
#include <linux/string.h>

static unsigned short translations[][E_TABSZ] = {
  /* 8-bit Latin-1 mapped to Unicode -- trivial mapping */
  [LAT1_MAP] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
    0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
    0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
    0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
    0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
    0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
    0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
    0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
    0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd,
0x00fe, 0x00ff }, /* VT100 graphics mapped to Unicode */ [GRAF_MAP] = { 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x2192, 0x2190, 0x2191, 0x2193, 0x002f, 0x2588, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x00a0, 0x25c6, 0x2592, 0x2409, 0x240c, 0x240d, 0x240a, 0x00b0, 0x00b1, 0x2591, 0x240b, 0x2518, 0x2510, 0x250c, 0x2514, 0x253c, 0x23ba, 0x23bb, 0x2500, 0x23bc, 0x23bd, 0x251c, 0x2524, 0x2534, 0x252c, 0x2502, 0x2264, 0x2265, 0x03c0, 0x2260, 0x00a3, 0x00b7, 0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff }, /* IBM Codepage 437 mapped to Unicode */ [IBMPC_MAP] = { 0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022, 0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, 0x263c, 0x25b6, 0x25c0, 0x2195, 0x203c, 0x00b6, 0x00a7, 0x25ac, 0x21a8, 0x2191, 0x2193, 0x2192, 0x2190, 0x221f, 0x2194, 0x25b2, 0x25bc, 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x2302, 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 
0x00bc, 0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0 }, /* User mapping -- default to codes for direct font mapping */ [USER_MAP] = { 0xf000, 0xf001, 0xf002, 0xf003, 0xf004, 0xf005, 0xf006, 0xf007, 0xf008, 0xf009, 0xf00a, 0xf00b, 0xf00c, 0xf00d, 0xf00e, 0xf00f, 0xf010, 0xf011, 0xf012, 0xf013, 0xf014, 0xf015, 0xf016, 0xf017, 0xf018, 0xf019, 0xf01a, 0xf01b, 0xf01c, 0xf01d, 0xf01e, 0xf01f, 0xf020, 0xf021, 0xf022, 0xf023, 0xf024, 0xf025, 0xf026, 0xf027, 0xf028, 0xf029, 0xf02a, 0xf02b, 0xf02c, 0xf02d, 0xf02e, 0xf02f, 0xf030, 0xf031, 0xf032, 0xf033, 0xf034, 0xf035, 0xf036, 0xf037, 0xf038, 0xf039, 0xf03a, 0xf03b, 0xf03c, 0xf03d, 0xf03e, 0xf03f, 0xf040, 0xf041, 0xf042, 0xf043, 0xf044, 0xf045, 0xf046, 0xf047, 0xf048, 0xf049, 0xf04a, 0xf04b, 0xf04c, 0xf04d, 0xf04e, 0xf04f, 0xf050, 0xf051, 0xf052, 0xf053, 0xf054, 0xf055, 0xf056, 0xf057, 0xf058, 0xf059, 0xf05a, 0xf05b, 0xf05c, 0xf05d, 0xf05e, 0xf05f, 0xf060, 0xf061, 0xf062, 0xf063, 0xf064, 0xf065, 0xf066, 0xf067, 0xf068, 0xf069, 0xf06a, 0xf06b, 0xf06c, 0xf06d, 0xf06e, 0xf06f, 0xf070, 0xf071, 0xf072, 0xf073, 0xf074, 0xf075, 0xf076, 0xf077, 0xf078, 0xf079, 0xf07a, 0xf07b, 0xf07c, 0xf07d, 0xf07e, 0xf07f, 0xf080, 0xf081, 0xf082, 0xf083, 0xf084, 0xf085, 0xf086, 0xf087, 0xf088, 0xf089, 0xf08a, 0xf08b, 0xf08c, 0xf08d, 0xf08e, 0xf08f, 0xf090, 0xf091, 0xf092, 0xf093, 0xf094, 0xf095, 0xf096, 0xf097, 0xf098, 0xf099, 0xf09a, 0xf09b, 0xf09c, 0xf09d, 0xf09e, 0xf09f, 0xf0a0, 0xf0a1, 0xf0a2, 0xf0a3, 0xf0a4, 0xf0a5, 0xf0a6, 0xf0a7, 0xf0a8, 0xf0a9, 0xf0aa, 0xf0ab, 0xf0ac, 0xf0ad, 0xf0ae, 0xf0af, 0xf0b0, 0xf0b1, 0xf0b2, 0xf0b3, 0xf0b4, 0xf0b5, 0xf0b6, 0xf0b7, 0xf0b8, 0xf0b9, 0xf0ba, 0xf0bb, 0xf0bc, 0xf0bd, 0xf0be, 0xf0bf, 0xf0c0, 0xf0c1, 0xf0c2, 0xf0c3, 0xf0c4, 0xf0c5, 0xf0c6, 0xf0c7, 0xf0c8, 0xf0c9, 0xf0ca, 0xf0cb, 0xf0cc, 0xf0cd, 0xf0ce, 0xf0cf, 0xf0d0, 0xf0d1, 0xf0d2, 0xf0d3, 0xf0d4, 0xf0d5, 0xf0d6, 0xf0d7, 0xf0d8, 0xf0d9, 0xf0da, 0xf0db, 0xf0dc, 0xf0dd, 0xf0de, 0xf0df, 0xf0e0, 0xf0e1, 0xf0e2, 0xf0e3, 0xf0e4, 0xf0e5, 0xf0e6, 0xf0e7, 0xf0e8, 0xf0e9, 0xf0ea, 0xf0eb, 0xf0ec, 0xf0ed, 0xf0ee, 0xf0ef, 0xf0f0, 0xf0f1, 0xf0f2, 0xf0f3, 0xf0f4, 0xf0f5, 0xf0f6, 0xf0f7, 0xf0f8, 0xf0f9, 0xf0fa, 0xf0fb, 0xf0fc, 0xf0fd, 0xf0fe, 0xf0ff } }; /* The standard kernel character-to-font mappings are not invertible -- this is just a best effort. 
*/ #define MAX_GLYPH 512 /* Max possible glyph value */ static enum translation_map inv_translate[MAX_NR_CONSOLES]; #define UNI_DIRS 32U #define UNI_DIR_ROWS 32U #define UNI_ROW_GLYPHS 64U #define UNI_DIR_BITS GENMASK(15, 11) #define UNI_ROW_BITS GENMASK(10, 6) #define UNI_GLYPH_BITS GENMASK( 5, 0) #define UNI_DIR(uni) FIELD_GET(UNI_DIR_BITS, (uni)) #define UNI_ROW(uni) FIELD_GET(UNI_ROW_BITS, (uni)) #define UNI_GLYPH(uni) FIELD_GET(UNI_GLYPH_BITS, (uni)) #define UNI(dir, row, glyph) (FIELD_PREP(UNI_DIR_BITS, (dir)) | \ FIELD_PREP(UNI_ROW_BITS, (row)) | \ FIELD_PREP(UNI_GLYPH_BITS, (glyph))) /** * struct uni_pagedict - unicode directory * * @uni_pgdir: 32*32*64 table with glyphs * @refcount: reference count of this structure * @sum: checksum * @inverse_translations: best-effort inverse mapping * @inverse_trans_unicode: best-effort inverse mapping to unicode */ struct uni_pagedict { u16 **uni_pgdir[UNI_DIRS]; unsigned long refcount; unsigned long sum; unsigned char *inverse_translations[LAST_MAP + 1]; u16 *inverse_trans_unicode; }; static struct uni_pagedict *dflt; static void set_inverse_transl(struct vc_data *conp, struct uni_pagedict *dict, enum translation_map m) { unsigned short *t = translations[m]; unsigned char *inv; if (!dict) return; inv = dict->inverse_translations[m]; if (!inv) { inv = dict->inverse_translations[m] = kmalloc(MAX_GLYPH, GFP_KERNEL); if (!inv) return; } memset(inv, 0, MAX_GLYPH); for (unsigned int ch = 0; ch < ARRAY_SIZE(translations[m]); ch++) { int glyph = conv_uni_to_pc(conp, t[ch]); if (glyph >= 0 && glyph < MAX_GLYPH && inv[glyph] < 32) { /* prefer '-' above SHY etc. */ inv[glyph] = ch; } } } static void set_inverse_trans_unicode(struct uni_pagedict *dict) { unsigned int d, r, g; u16 *inv; if (!dict) return; inv = dict->inverse_trans_unicode; if (!inv) { inv = dict->inverse_trans_unicode = kmalloc_array(MAX_GLYPH, sizeof(*inv), GFP_KERNEL); if (!inv) return; } memset(inv, 0, MAX_GLYPH * sizeof(*inv)); for (d = 0; d < UNI_DIRS; d++) { u16 **dir = dict->uni_pgdir[d]; if (!dir) continue; for (r = 0; r < UNI_DIR_ROWS; r++) { u16 *row = dir[r]; if (!row) continue; for (g = 0; g < UNI_ROW_GLYPHS; g++) { u16 glyph = row[g]; if (glyph < MAX_GLYPH && inv[glyph] < 32) inv[glyph] = UNI(d, r, g); } } } } unsigned short *set_translate(enum translation_map m, struct vc_data *vc) { inv_translate[vc->vc_num] = m; return translations[m]; } /* * Inverse translation is impossible for several reasons: * 1. The font<->character maps are not 1-1. * 2. The text may have been written while a different translation map * was active. * Still, it is now possible to a certain extent to cut and paste non-ASCII. */ u16 inverse_translate(const struct vc_data *conp, u16 glyph, bool use_unicode) { struct uni_pagedict *p; enum translation_map m; if (glyph >= MAX_GLYPH) return 0; p = *conp->uni_pagedict_loc; if (!p) return glyph; if (use_unicode) { if (!p->inverse_trans_unicode) return glyph; return p->inverse_trans_unicode[glyph]; } m = inv_translate[conp->vc_num]; if (!p->inverse_translations[m]) return glyph; return p->inverse_translations[m][glyph]; } EXPORT_SYMBOL_GPL(inverse_translate); static void update_user_maps(void) { int i; struct uni_pagedict *p, *q = NULL; for (i = 0; i < MAX_NR_CONSOLES; i++) { if (!vc_cons_allocated(i)) continue; p = *vc_cons[i].d->uni_pagedict_loc; if (p && p != q) { set_inverse_transl(vc_cons[i].d, p, USER_MAP); set_inverse_trans_unicode(p); q = p; } } } /* * Load customizable translation table * arg points to a 256 byte translation table. 
* * The "old" variants are for translation directly to font (using the * 0xf000-0xf0ff "transparent" Unicodes) whereas the "new" variants set * Unicodes explicitly. */ int con_set_trans_old(unsigned char __user * arg) { unsigned short inbuf[E_TABSZ]; unsigned int i; unsigned char ch; for (i = 0; i < ARRAY_SIZE(inbuf); i++) { if (get_user(ch, &arg[i])) return -EFAULT; inbuf[i] = UNI_DIRECT_BASE | ch; } console_lock(); memcpy(translations[USER_MAP], inbuf, sizeof(inbuf)); update_user_maps(); console_unlock(); return 0; } int con_get_trans_old(unsigned char __user * arg) { int i, ch; unsigned short *p = translations[USER_MAP]; unsigned char outbuf[E_TABSZ]; console_lock(); for (i = 0; i < ARRAY_SIZE(outbuf); i++) { ch = conv_uni_to_pc(vc_cons[fg_console].d, p[i]); outbuf[i] = (ch & ~0xff) ? 0 : ch; } console_unlock(); return copy_to_user(arg, outbuf, sizeof(outbuf)) ? -EFAULT : 0; } int con_set_trans_new(ushort __user * arg) { unsigned short inbuf[E_TABSZ]; if (copy_from_user(inbuf, arg, sizeof(inbuf))) return -EFAULT; console_lock(); memcpy(translations[USER_MAP], inbuf, sizeof(inbuf)); update_user_maps(); console_unlock(); return 0; } int con_get_trans_new(ushort __user * arg) { unsigned short outbuf[E_TABSZ]; console_lock(); memcpy(outbuf, translations[USER_MAP], sizeof(outbuf)); console_unlock(); return copy_to_user(arg, outbuf, sizeof(outbuf)) ? -EFAULT : 0; } /* * Unicode -> current font conversion * * A font has at most 512 chars, usually 256. * But one font position may represent several Unicode chars. * A hashtable is somewhat of a pain to deal with, so use a * "paged table" instead. Simulation has shown the memory cost of * this 3-level paged table scheme to be comparable to a hash table. */ extern u8 dfont_unicount[]; /* Defined in console_defmap.c */ extern u16 dfont_unitable[]; static void con_release_unimap(struct uni_pagedict *dict) { unsigned int d, r; if (dict == dflt) dflt = NULL; for (d = 0; d < UNI_DIRS; d++) { u16 **dir = dict->uni_pgdir[d]; if (dir != NULL) { for (r = 0; r < UNI_DIR_ROWS; r++) kfree(dir[r]); kfree(dir); } dict->uni_pgdir[d] = NULL; } for (r = 0; r < ARRAY_SIZE(dict->inverse_translations); r++) { kfree(dict->inverse_translations[r]); dict->inverse_translations[r] = NULL; } kfree(dict->inverse_trans_unicode); dict->inverse_trans_unicode = NULL; } /* Caller must hold the console lock */ void con_free_unimap(struct vc_data *vc) { struct uni_pagedict *p; p = *vc->uni_pagedict_loc; if (!p) return; *vc->uni_pagedict_loc = NULL; if (--p->refcount) return; con_release_unimap(p); kfree(p); } static int con_unify_unimap(struct vc_data *conp, struct uni_pagedict *dict1) { struct uni_pagedict *dict2; unsigned int cons, d, r; for (cons = 0; cons < MAX_NR_CONSOLES; cons++) { if (!vc_cons_allocated(cons)) continue; dict2 = *vc_cons[cons].d->uni_pagedict_loc; if (!dict2 || dict2 == dict1 || dict2->sum != dict1->sum) continue; for (d = 0; d < UNI_DIRS; d++) { u16 **dir1 = dict1->uni_pgdir[d]; u16 **dir2 = dict2->uni_pgdir[d]; if (!dir1 && !dir2) continue; if (!dir1 || !dir2) break; for (r = 0; r < UNI_DIR_ROWS; r++) { if (!dir1[r] && !dir2[r]) continue; if (!dir1[r] || !dir2[r]) break; if (memcmp(dir1[r], dir2[r], UNI_ROW_GLYPHS * sizeof(*dir1[r]))) break; } if (r < UNI_DIR_ROWS) break; } if (d == UNI_DIRS) { dict2->refcount++; *conp->uni_pagedict_loc = dict2; con_release_unimap(dict1); kfree(dict1); return 1; } } return 0; } static int con_insert_unipair(struct uni_pagedict *p, u_short unicode, u_short fontpos) { u16 **dir, *row; unsigned int n; n = UNI_DIR(unicode); dir 
= p->uni_pgdir[n]; if (!dir) { dir = p->uni_pgdir[n] = kcalloc(UNI_DIR_ROWS, sizeof(*dir), GFP_KERNEL); if (!dir) return -ENOMEM; } n = UNI_ROW(unicode); row = dir[n]; if (!row) { row = dir[n] = kmalloc_array(UNI_ROW_GLYPHS, sizeof(*row), GFP_KERNEL); if (!row) return -ENOMEM; /* No glyphs for the characters (yet) */ memset(row, 0xff, UNI_ROW_GLYPHS * sizeof(*row)); } row[UNI_GLYPH(unicode)] = fontpos; p->sum += (fontpos << 20U) + unicode; return 0; } static int con_allocate_new(struct vc_data *vc) { struct uni_pagedict *new, *old = *vc->uni_pagedict_loc; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; new->refcount = 1; *vc->uni_pagedict_loc = new; if (old) old->refcount--; return 0; } /* Caller must hold the lock */ static int con_do_clear_unimap(struct vc_data *vc) { struct uni_pagedict *old = *vc->uni_pagedict_loc; if (!old || old->refcount > 1) return con_allocate_new(vc); old->sum = 0; con_release_unimap(old); return 0; } int con_clear_unimap(struct vc_data *vc) { int ret; console_lock(); ret = con_do_clear_unimap(vc); console_unlock(); return ret; } static struct uni_pagedict *con_unshare_unimap(struct vc_data *vc, struct uni_pagedict *old) { struct uni_pagedict *new; unsigned int d, r, g; int ret; u16 uni = 0; ret = con_allocate_new(vc); if (ret) return ERR_PTR(ret); new = *vc->uni_pagedict_loc; /* * uni_pgdir is a 32*32*64 table with rows allocated when its first * entry is added. The unicode value must still be incremented for * empty rows. We are copying entries from "old" to "new". */ for (d = 0; d < UNI_DIRS; d++) { u16 **dir = old->uni_pgdir[d]; if (!dir) { /* Account for empty table */ uni += UNI_DIR_ROWS * UNI_ROW_GLYPHS; continue; } for (r = 0; r < UNI_DIR_ROWS; r++) { u16 *row = dir[r]; if (!row) { /* Account for row of 64 empty entries */ uni += UNI_ROW_GLYPHS; continue; } for (g = 0; g < UNI_ROW_GLYPHS; g++, uni++) { if (row[g] == 0xffff) continue; /* * Found one, copy entry for unicode uni with * fontpos value row[g]. */ ret = con_insert_unipair(new, uni, row[g]); if (ret) { old->refcount++; *vc->uni_pagedict_loc = old; con_release_unimap(new); kfree(new); return ERR_PTR(ret); } } } } return new; } int con_set_unimap(struct vc_data *vc, ushort ct, struct unipair __user *list) { int err = 0, err1; struct uni_pagedict *dict; struct unipair *unilist, *plist; if (!ct) return 0; unilist = vmemdup_array_user(list, ct, sizeof(*unilist)); if (IS_ERR(unilist)) return PTR_ERR(unilist); console_lock(); /* Save original vc_unipagdir_loc in case we allocate a new one */ dict = *vc->uni_pagedict_loc; if (!dict) { err = -EINVAL; goto out_unlock; } if (dict->refcount > 1) { dict = con_unshare_unimap(vc, dict); if (IS_ERR(dict)) { err = PTR_ERR(dict); goto out_unlock; } } else if (dict == dflt) { dflt = NULL; } /* * Insert user specified unicode pairs into new table. */ for (plist = unilist; ct; ct--, plist++) { err1 = con_insert_unipair(dict, plist->unicode, plist->fontpos); if (err1) err = err1; } /* * Merge with fontmaps of any other virtual consoles. */ if (con_unify_unimap(vc, dict)) goto out_unlock; for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++) set_inverse_transl(vc, dict, m); set_inverse_trans_unicode(dict); out_unlock: console_unlock(); kvfree(unilist); return err; } /** * con_set_default_unimap - set default unicode map * @vc: the console we are updating * * Loads the unimap for the hardware font, as defined in uni_hash.tbl. * The representation used was the most compact I could come up * with. 
This routine is executed at video setup, and when the * PIO_FONTRESET ioctl is called. * * The caller must hold the console lock */ int con_set_default_unimap(struct vc_data *vc) { struct uni_pagedict *dict; unsigned int fontpos, count; int err = 0, err1; u16 *dfont; if (dflt) { dict = *vc->uni_pagedict_loc; if (dict == dflt) return 0; dflt->refcount++; *vc->uni_pagedict_loc = dflt; if (dict && !--dict->refcount) { con_release_unimap(dict); kfree(dict); } return 0; } /* The default font is always 256 characters */ err = con_do_clear_unimap(vc); if (err) return err; dict = *vc->uni_pagedict_loc; dfont = dfont_unitable; for (fontpos = 0; fontpos < 256U; fontpos++) for (count = dfont_unicount[fontpos]; count; count--) { err1 = con_insert_unipair(dict, *(dfont++), fontpos); if (err1) err = err1; } if (con_unify_unimap(vc, dict)) { dflt = *vc->uni_pagedict_loc; return err; } for (enum translation_map m = FIRST_MAP; m <= LAST_MAP; m++) set_inverse_transl(vc, dict, m); set_inverse_trans_unicode(dict); dflt = dict; return err; } EXPORT_SYMBOL(con_set_default_unimap); /** * con_copy_unimap - copy unimap between two vts * @dst_vc: target * @src_vc: source * * The caller must hold the console lock when invoking this method */ int con_copy_unimap(struct vc_data *dst_vc, struct vc_data *src_vc) { struct uni_pagedict *src; if (!*src_vc->uni_pagedict_loc) return -EINVAL; if (*dst_vc->uni_pagedict_loc == *src_vc->uni_pagedict_loc) return 0; con_free_unimap(dst_vc); src = *src_vc->uni_pagedict_loc; src->refcount++; *dst_vc->uni_pagedict_loc = src; return 0; } EXPORT_SYMBOL(con_copy_unimap); /* * con_get_unimap - get the unicode map * * Read the console unicode data for this console. Called from the ioctl * handlers. */ int con_get_unimap(struct vc_data *vc, ushort ct, ushort __user *uct, struct unipair __user *list) { ushort ect; struct uni_pagedict *dict; struct unipair *unilist; unsigned int d, r, g; int ret = 0; unilist = kvmalloc_array(ct, sizeof(*unilist), GFP_KERNEL); if (!unilist) return -ENOMEM; console_lock(); ect = 0; dict = *vc->uni_pagedict_loc; if (!dict) goto unlock; for (d = 0; d < UNI_DIRS; d++) { u16 **dir = dict->uni_pgdir[d]; if (!dir) continue; for (r = 0; r < UNI_DIR_ROWS; r++) { u16 *row = dir[r]; if (!row) continue; for (g = 0; g < UNI_ROW_GLYPHS; g++, row++) { if (*row >= MAX_GLYPH) continue; if (ect < ct) { unilist[ect].unicode = UNI(d, r, g); unilist[ect].fontpos = *row; } ect++; } } } unlock: console_unlock(); if (copy_to_user(list, unilist, min(ect, ct) * sizeof(*unilist))) ret = -EFAULT; if (put_user(ect, uct)) ret = -EFAULT; kvfree(unilist); return ret ? ret : (ect <= ct) ? 0 : -ENOMEM; } /* * Always use USER_MAP. These functions are used by the keyboard, * which shouldn't be affected by G0/G1 switching, etc. * If the user map still contains default values, i.e. the * direct-to-font mapping, then assume user is using Latin1. * * FIXME: at some point we need to decide if we want to lock the table * update element itself via the keyboard_event_lock for consistency with the * keyboard driver as well as the consoles */ /* may be called during an interrupt */ u32 conv_8bit_to_uni(unsigned char c) { unsigned short uni = translations[USER_MAP][c]; return uni == (0xf000 | c) ? 
c : uni; } int conv_uni_to_8bit(u32 uni) { int c; for (c = 0; c < ARRAY_SIZE(translations[USER_MAP]); c++) if (translations[USER_MAP][c] == uni || (translations[USER_MAP][c] == (c | 0xf000) && uni == c)) return c; return -1; } int conv_uni_to_pc(struct vc_data *conp, long ucs) { struct uni_pagedict *dict; u16 **dir, *row, glyph; /* Only 16-bit codes supported at this time */ if (ucs > 0xffff) return -4; /* Not found */ else if (ucs < 0x20) return -1; /* Not a printable character */ else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f)) return -2; /* Zero-width space */ /* * UNI_DIRECT_BASE indicates the start of the region in the User Zone * which always has a 1:1 mapping to the currently loaded font. The * UNI_DIRECT_MASK indicates the bit span of the region. */ else if ((ucs & ~UNI_DIRECT_MASK) == UNI_DIRECT_BASE) return ucs & UNI_DIRECT_MASK; dict = *conp->uni_pagedict_loc; if (!dict) return -3; dir = dict->uni_pgdir[UNI_DIR(ucs)]; if (!dir) return -4; row = dir[UNI_ROW(ucs)]; if (!row) return -4; glyph = row[UNI_GLYPH(ucs)]; if (glyph >= MAX_GLYPH) return -4; return glyph; } /* * This is called at sys_setup time, after memory and the console are * initialized. It must be possible to call kmalloc(..., GFP_KERNEL) * from this function, hence the call from sys_setup. */ void __init console_map_init(void) { int i; for (i = 0; i < MAX_NR_CONSOLES; i++) if (vc_cons_allocated(i) && !*vc_cons[i].d->uni_pagedict_loc) con_set_default_unimap(vc_cons[i].d); }
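/*
 * Illustration (not kernel code): the "paged table" described above splits a
 * 16-bit code point into a directory, row and glyph index (32 x 32 x 64, per
 * the comment in con_unshare_unimap()), allocating rows only when a mapping
 * is inserted.  The minimal userspace sketch below mirrors the shape of
 * con_insert_unipair() and conv_uni_to_pc(); the names, the 5/5/6 bit split
 * and the single 0xffff "unmapped" marker are illustrative simplifications
 * (the kernel treats any value >= MAX_GLYPH as unmapped).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define DIRS		32	/* top 5 bits of the code point */
#define DIR_ROWS	32	/* next 5 bits */
#define ROW_GLYPHS	64	/* low 6 bits */
#define NO_GLYPH	0xffff	/* "no font position known" marker */

struct pagedict {
	uint16_t **dir[DIRS];	/* dir -> rows -> glyphs, allocated lazily */
};

static int insert_pair(struct pagedict *p, uint16_t uni, uint16_t fontpos)
{
	unsigned int d = uni >> 11, r = (uni >> 6) & 31, g = uni & 63;
	uint16_t **rows = p->dir[d];
	uint16_t *row;

	if (!rows) {
		rows = p->dir[d] = calloc(DIR_ROWS, sizeof(*rows));
		if (!rows)
			return -1;
	}
	row = rows[r];
	if (!row) {
		row = rows[r] = malloc(ROW_GLYPHS * sizeof(*row));
		if (!row)
			return -1;
		/* 0xff bytes everywhere: no glyph known for these code points yet */
		memset(row, 0xff, ROW_GLYPHS * sizeof(*row));
	}
	row[g] = fontpos;
	return 0;
}

static int lookup(const struct pagedict *p, uint16_t uni)
{
	uint16_t **rows = p->dir[uni >> 11];
	uint16_t *row = rows ? rows[(uni >> 6) & 31] : NULL;
	uint16_t glyph = row ? row[uni & 63] : NO_GLYPH;

	return glyph == NO_GLYPH ? -1 : glyph;
}

int main(void)
{
	struct pagedict p = { 0 };

	insert_pair(&p, 0x00e9, 130);	/* e.g. U+00E9 -> font position 130 */
	printf("U+00E9 -> %d\n", lookup(&p, 0x00e9));	/* 130 */
	printf("U+20AC -> %d\n", lookup(&p, 0x20ac));	/* unmapped: -1 */
	/* cleanup omitted for brevity */
	return 0;
}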
259 679 37 2169 2168 57 131 2167 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_INTERNAL_H #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> #include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ }; extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION int blk_crypto_sysfs_register(struct gendisk *disk); void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2); static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), bio->bi_crypt_context); } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(bio->bi_crypt_context, bio->bi_iter.bi_size, req->crypt_ctx); } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), next->crypt_ctx); } static inline void blk_crypto_rq_set_defaults(struct request *rq) { rq->crypt_ctx = NULL; rq->crypt_keyslot = NULL; } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return rq->crypt_ctx; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return rq->crypt_keyslot; } blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key); bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) { } static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_back_mergeable(struct 
request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return true; } static inline void blk_crypto_rq_set_defaults(struct request *rq) { } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return false; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) { if (bio_has_crypt_ctx(bio)) __bio_crypt_advance(bio, bytes); } void __bio_crypt_free_ctx(struct bio *bio); static inline void bio_crypt_free_ctx(struct bio *bio) { if (bio_has_crypt_ctx(bio)) __bio_crypt_free_ctx(bio); } static inline void bio_crypt_do_front_merge(struct request *rq, struct bio *bio) { #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (bio_has_crypt_ctx(bio)) memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, sizeof(rq->crypt_ctx->bc_dun)); #endif } bool __blk_crypto_bio_prep(struct bio **bio_ptr); static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) { if (bio_has_crypt_ctx(*bio_ptr)) return __blk_crypto_bio_prep(bio_ptr); return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } void __blk_crypto_rq_put_keyslot(struct request *rq); static inline void blk_crypto_rq_put_keyslot(struct request *rq) { if (blk_crypto_rq_has_keyslot(rq)) __blk_crypto_rq_put_keyslot(rq); } void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) __blk_crypto_free_request(rq); } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask); /** * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio * is inserted * @rq: The request to prepare * @bio: The first bio being inserted into the request * @gfp_mask: Memory allocation flags * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (bio_has_crypt_ctx(bio)) return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); return 0; } #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ static inline int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { pr_warn_once("crypto API fallback is disabled\n"); return -ENOPKG; } static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) { pr_warn_once("crypto API fallback disabled; failing request.\n"); (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; return false; } static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { return 0; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */
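/*
 * Illustration (not part of the header above): almost every hook in
 * blk-crypto-internal.h follows the same layering -- an out-of-line
 * __blk_crypto_*() slow path, a static inline wrapper that bails out after a
 * cheap check (bio_has_crypt_ctx(), blk_crypto_rq_is_encrypted()), and no-op
 * stubs when CONFIG_BLK_INLINE_ENCRYPTION is off so callers never need their
 * own #ifdefs.  The standalone sketch below shows that layering with made-up
 * names (struct io_req, feature_prepare, CONFIG_FEATURE_CRYPTO); it is a
 * simplified single-file version, not the header's exact organization.
 */
#include <stdio.h>

struct io_req {
	void *crypt_ctx;	/* NULL for the common, unencrypted case */
};

#ifdef CONFIG_FEATURE_CRYPTO
/* Out-of-line slow path; in the real layout this lives in a .c file. */
static int __feature_prepare(struct io_req *req)
{
	/* ... the expensive work (e.g. programming a keyslot) would go here ... */
	(void)req;
	return 0;
}
#else
/* Compiled-out stub: callers never need their own #ifdef. */
static inline int __feature_prepare(struct io_req *req)
{
	(void)req;
	return 0;
}
#endif

/* Inline wrapper: the common unencrypted case costs a single NULL check. */
static inline int feature_prepare(struct io_req *req)
{
	if (req->crypt_ctx)
		return __feature_prepare(req);
	return 0;
}

int main(void)
{
	struct io_req plain = { .crypt_ctx = NULL };
	struct io_req enc = { .crypt_ctx = &plain };	/* any non-NULL pointer stands in for a context */

	printf("plain prepare     -> %d\n", feature_prepare(&plain));
	printf("encrypted prepare -> %d\n", feature_prepare(&enc));
	return 0;
}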
2 1 3 3 1 1 2 2 2 2 2 12 2 2 7 7 7 7 9 10 7 10 7 7 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 // SPDX-License-Identifier: GPL-2.0-only /* * irqchip.c: Common API for in kernel interrupt controllers * Copyright (c) 2007, Intel Corporation. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (c) 2013, Alexander Graf <agraf@suse.de> * * This file is derived from virt/kvm/irq_comm.c. * * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Alexander Graf <agraf@suse.de> */ #include <linux/kvm_host.h> #include <linux/slab.h> #include <linux/srcu.h> #include <linux/export.h> #include <trace/events/kvm.h> int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { struct kvm_irq_routing_table *irq_rt; struct kvm_kernel_irq_routing_entry *e; int n = 0; irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, lockdep_is_held(&kvm->irq_lock)); if (irq_rt && gsi < irq_rt->nr_rt_entries) { hlist_for_each_entry(e, &irq_rt->map[gsi], link) { entries[n] = *e; ++n; } } return n; } int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_routing_table *irq_rt; irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); return irq_rt->chip[irqchip][pin]; } int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) { struct kvm_kernel_irq_routing_entry route; if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) return -EINVAL; route.msi.address_lo = msi->address_lo; route.msi.address_hi = msi->address_hi; route.msi.data = msi->data; route.msi.flags = msi->flags; route.msi.devid = msi->devid; return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); } /* * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to */ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS]; int ret = -1, i, idx; trace_kvm_set_irq(irq, level, irq_source_id); /* Not possible to detect if the guest uses the PIC or the * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ idx = srcu_read_lock(&kvm->irq_srcu); i = kvm_irq_map_gsi(kvm, irq_set, irq); srcu_read_unlock(&kvm->irq_srcu, idx); while (i--) { int r; r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, line_status); if (r < 0) continue; ret = r + ((ret < 0) ? 
0 : ret); } return ret; } static void free_irq_routing_table(struct kvm_irq_routing_table *rt) { int i; if (!rt) return; for (i = 0; i < rt->nr_rt_entries; ++i) { struct kvm_kernel_irq_routing_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &rt->map[i], link) { hlist_del(&e->link); kfree(e); } } kfree(rt); } void kvm_free_irq_routing(struct kvm *kvm) { /* Called only during vm destruction. Nobody can use the pointer at this stage */ struct kvm_irq_routing_table *rt = rcu_access_pointer(kvm->irq_routing); free_irq_routing_table(rt); } static int setup_routing_entry(struct kvm *kvm, struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { struct kvm_kernel_irq_routing_entry *ei; int r; u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES); /* * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ hlist_for_each_entry(ei, &rt->map[gsi], link) if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return -EINVAL; e->gsi = gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) return r; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi; hlist_add_head(&e->link, &rt->map[e->gsi]); return 0; } void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) { } bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm) { return true; } int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, unsigned flags) { struct kvm_irq_routing_table *new, *old; struct kvm_kernel_irq_routing_entry *e; u32 i, j, nr_rt_entries = 0; int r; for (i = 0; i < nr; ++i) { if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) return -EINVAL; nr_rt_entries = max(nr_rt_entries, ue[i].gsi); } nr_rt_entries += 1; new = kzalloc(struct_size(new, map, nr_rt_entries), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->nr_rt_entries = nr_rt_entries; for (i = 0; i < KVM_NR_IRQCHIPS; i++) for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) new->chip[i][j] = -1; for (i = 0; i < nr; ++i) { r = -ENOMEM; e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT); if (!e) goto out; r = -EINVAL; switch (ue->type) { case KVM_IRQ_ROUTING_MSI: if (ue->flags & ~KVM_MSI_VALID_DEVID) goto free_entry; break; default: if (ue->flags) goto free_entry; break; } r = setup_routing_entry(kvm, new, e, ue); if (r) goto free_entry; ++ue; } mutex_lock(&kvm->irq_lock); old = rcu_dereference_protected(kvm->irq_routing, 1); rcu_assign_pointer(kvm->irq_routing, new); kvm_irq_routing_update(kvm); kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); kvm_arch_post_irq_routing_update(kvm); synchronize_srcu_expedited(&kvm->irq_srcu); new = old; r = 0; goto out; free_entry: kfree(e); out: free_irq_routing_table(new); return r; }
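/*
 * Illustration (not kernel code): kvm_set_irq() above documents its return
 * convention (< 0 ignored, 0 coalesced, > 0 number of CPUs reached) and folds
 * the per-routing-entry results with "ret = r + ((ret < 0) ? 0 : ret)".  The
 * standalone sketch below replays that fold on a few result arrays to show
 * how negative entries are skipped while non-negative ones accumulate.
 */
#include <stdio.h>

static int fold_irq_results(const int *r, int n)
{
	int ret = -1;	/* nothing has handled the interrupt yet */

	for (int i = 0; i < n; i++) {
		if (r[i] < 0)
			continue;			/* this entry ignored it */
		ret = r[i] + ((ret < 0) ? 0 : ret);	/* same fold as kvm_set_irq() */
	}
	return ret;
}

int main(void)
{
	int all_ignored[] = { -1, -1 };
	int coalesced[]   = { -1, 0 };
	int delivered[]   = { 2, -1, 1 };

	printf("%d\n", fold_irq_results(all_ignored, 2));	/* -1: ignored */
	printf("%d\n", fold_irq_results(coalesced, 2));		/*  0: coalesced */
	printf("%d\n", fold_irq_results(delivered, 3));		/*  3: CPUs reached */
	return 0;
}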
911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2002 Petko Manolov (petkan@users.sourceforge.net) */ #include <linux/signal.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/mii.h> #include <linux/ethtool.h> #include <linux/usb.h> #include <linux/uaccess.h> /* Version Information */ #define DRIVER_VERSION "v0.6.2 (2004/08/27)" #define DRIVER_AUTHOR "Petko Manolov <petkan@users.sourceforge.net>" #define DRIVER_DESC "rtl8150 based usb-ethernet driver" #define IDR 0x0120 #define MAR 0x0126 #define CR 0x012e #define TCR 0x012f #define RCR 0x0130 #define TSR 0x0132 #define RSR 0x0133 #define CON0 0x0135 #define CON1 0x0136 #define MSR 0x0137 #define PHYADD 0x0138 #define PHYDAT 0x0139 #define PHYCNT 0x013b #define GPPC 0x013d #define BMCR 0x0140 #define BMSR 0x0142 #define ANAR 0x0144 #define ANLP 0x0146 #define AER 0x0148 #define CSCR 0x014C /* This one has the link status */ #define CSCR_LINK_STATUS (1 << 3) #define IDR_EEPROM 0x1202 #define PHY_READ 0 #define PHY_WRITE 0x20 #define PHY_GO 0x40 #define MII_TIMEOUT 10 #define INTBUFSIZE 8 #define RTL8150_REQT_READ 0xc0 #define RTL8150_REQT_WRITE 0x40 #define RTL8150_REQ_GET_REGS 0x05 #define RTL8150_REQ_SET_REGS 0x05 /* Transmit status register errors */ #define TSR_ECOL (1<<5) #define TSR_LCOL (1<<4) #define TSR_LOSS_CRS (1<<3) #define TSR_JBR (1<<2) #define TSR_ERRORS (TSR_ECOL | TSR_LCOL | TSR_LOSS_CRS | TSR_JBR) /* Receive status register errors */ #define RSR_CRC (1<<2) #define RSR_FAE (1<<1) #define RSR_ERRORS (RSR_CRC | RSR_FAE) /* Media status register definitions */ #define MSR_DUPLEX (1<<4) #define MSR_SPEED (1<<3) #define MSR_LINK (1<<2) /* Interrupt pipe data */ #define INT_TSR 0x00 #define INT_RSR 0x01 #define INT_MSR 0x02 #define INT_WAKSR 0x03 #define INT_TXOK_CNT 0x04 #define INT_RXLOST_CNT 0x05 #define INT_CRERR_CNT 0x06 #define INT_COL_CNT 0x07 #define RTL8150_MTU 1540 #define RTL8150_TX_TIMEOUT (HZ) #define RX_SKB_POOL_SIZE 4 /* rtl8150 flags */ #define RTL8150_HW_CRC 0 #define RX_REG_SET 1 #define RTL8150_UNPLUG 2 #define RX_URB_FAIL 3 /* Define these values to match your device */ #define VENDOR_ID_REALTEK 0x0bda #define VENDOR_ID_MELCO 0x0411 #define VENDOR_ID_MICRONET 0x3980 #define VENDOR_ID_LONGSHINE 0x07b8 #define VENDOR_ID_OQO 0x1557 #define VENDOR_ID_ZYXEL 0x0586 #define PRODUCT_ID_RTL8150 0x8150 #define PRODUCT_ID_LUAKTX 0x0012 #define PRODUCT_ID_LCS8138TX 0x401a #define PRODUCT_ID_SP128AR 0x0003 #define PRODUCT_ID_PRESTIGE 0x401a #undef EEPROM_WRITE /* table of devices that work with this driver */ static const struct usb_device_id rtl8150_table[] = { {USB_DEVICE(VENDOR_ID_REALTEK, PRODUCT_ID_RTL8150)}, {USB_DEVICE(VENDOR_ID_MELCO, PRODUCT_ID_LUAKTX)}, {USB_DEVICE(VENDOR_ID_MICRONET, PRODUCT_ID_SP128AR)}, {USB_DEVICE(VENDOR_ID_LONGSHINE, PRODUCT_ID_LCS8138TX)}, {USB_DEVICE(VENDOR_ID_OQO, PRODUCT_ID_RTL8150)}, {USB_DEVICE(VENDOR_ID_ZYXEL, PRODUCT_ID_PRESTIGE)}, {} }; MODULE_DEVICE_TABLE(usb, rtl8150_table); struct rtl8150 { unsigned long flags; struct usb_device *udev; struct tasklet_struct tl; struct net_device *netdev; struct urb *rx_urb, *tx_urb, *intr_urb; struct sk_buff *tx_skb, *rx_skb; struct sk_buff *rx_skb_pool[RX_SKB_POOL_SIZE]; spinlock_t rx_pool_lock; struct usb_ctrlrequest dr; int intr_interval; u8 *intr_buff; u8 phy; }; typedef 
struct rtl8150 rtl8150_t; struct async_req { struct usb_ctrlrequest dr; u16 rx_creg; }; static const char driver_name [] = "rtl8150"; /* ** ** device related part of the code ** */ static int get_registers(rtl8150_t * dev, u16 indx, u16 size, void *data) { return usb_control_msg_recv(dev->udev, 0, RTL8150_REQ_GET_REGS, RTL8150_REQT_READ, indx, 0, data, size, 1000, GFP_NOIO); } static int set_registers(rtl8150_t * dev, u16 indx, u16 size, const void *data) { return usb_control_msg_send(dev->udev, 0, RTL8150_REQ_SET_REGS, RTL8150_REQT_WRITE, indx, 0, data, size, 1000, GFP_NOIO); } static void async_set_reg_cb(struct urb *urb) { struct async_req *req = (struct async_req *)urb->context; int status = urb->status; if (status < 0) dev_dbg(&urb->dev->dev, "%s failed with %d", __func__, status); kfree(req); usb_free_urb(urb); } static int async_set_registers(rtl8150_t *dev, u16 indx, u16 size, u16 reg) { int res = -ENOMEM; struct urb *async_urb; struct async_req *req; req = kmalloc(sizeof(struct async_req), GFP_ATOMIC); if (req == NULL) return res; async_urb = usb_alloc_urb(0, GFP_ATOMIC); if (async_urb == NULL) { kfree(req); return res; } req->rx_creg = cpu_to_le16(reg); req->dr.bRequestType = RTL8150_REQT_WRITE; req->dr.bRequest = RTL8150_REQ_SET_REGS; req->dr.wIndex = 0; req->dr.wValue = cpu_to_le16(indx); req->dr.wLength = cpu_to_le16(size); usb_fill_control_urb(async_urb, dev->udev, usb_sndctrlpipe(dev->udev, 0), (void *)&req->dr, &req->rx_creg, size, async_set_reg_cb, req); res = usb_submit_urb(async_urb, GFP_ATOMIC); if (res) { if (res == -ENODEV) netif_device_detach(dev->netdev); dev_err(&dev->udev->dev, "%s failed with %d\n", __func__, res); } return res; } static int read_mii_word(rtl8150_t * dev, u8 phy, __u8 indx, u16 * reg) { int i; u8 data[3], tmp; data[0] = phy; data[1] = data[2] = 0; tmp = indx | PHY_READ | PHY_GO; i = 0; set_registers(dev, PHYADD, sizeof(data), data); set_registers(dev, PHYCNT, 1, &tmp); do { get_registers(dev, PHYCNT, 1, data); } while ((data[0] & PHY_GO) && (i++ < MII_TIMEOUT)); if (i <= MII_TIMEOUT) { get_registers(dev, PHYDAT, 2, data); *reg = data[0] | (data[1] << 8); return 0; } else return 1; } static int write_mii_word(rtl8150_t * dev, u8 phy, __u8 indx, u16 reg) { int i; u8 data[3], tmp; data[0] = phy; data[1] = reg & 0xff; data[2] = (reg >> 8) & 0xff; tmp = indx | PHY_WRITE | PHY_GO; i = 0; set_registers(dev, PHYADD, sizeof(data), data); set_registers(dev, PHYCNT, 1, &tmp); do { get_registers(dev, PHYCNT, 1, data); } while ((data[0] & PHY_GO) && (i++ < MII_TIMEOUT)); if (i <= MII_TIMEOUT) return 0; else return 1; } static void set_ethernet_addr(rtl8150_t *dev) { u8 node_id[ETH_ALEN]; int ret; ret = get_registers(dev, IDR, sizeof(node_id), node_id); if (!ret) { eth_hw_addr_set(dev->netdev, node_id); } else { eth_hw_addr_random(dev->netdev); netdev_notice(dev->netdev, "Assigned a random MAC address: %pM\n", dev->netdev->dev_addr); } } static int rtl8150_set_mac_address(struct net_device *netdev, void *p) { struct sockaddr *addr = p; rtl8150_t *dev = netdev_priv(netdev); if (netif_running(netdev)) return -EBUSY; eth_hw_addr_set(netdev, addr->sa_data); netdev_dbg(netdev, "Setting MAC address to %pM\n", netdev->dev_addr); /* Set the IDR registers. */ set_registers(dev, IDR, netdev->addr_len, netdev->dev_addr); #ifdef EEPROM_WRITE { int i; u8 cr; /* Get the CR contents. */ get_registers(dev, CR, 1, &cr); /* Set the WEPROM bit (eeprom write enable). */ cr |= 0x20; set_registers(dev, CR, 1, &cr); /* Write the MAC address into eeprom. 
Eeprom writes must be word-sized, so we need to split them up. */ for (i = 0; i * 2 < netdev->addr_len; i++) { set_registers(dev, IDR_EEPROM + (i * 2), 2, netdev->dev_addr + (i * 2)); } /* Clear the WEPROM bit (preventing accidental eeprom writes). */ cr &= 0xdf; set_registers(dev, CR, 1, &cr); } #endif return 0; } static int rtl8150_reset(rtl8150_t * dev) { u8 data = 0x10; int i = HZ; set_registers(dev, CR, 1, &data); do { get_registers(dev, CR, 1, &data); } while ((data & 0x10) && --i); return (i > 0) ? 1 : 0; } static int alloc_all_urbs(rtl8150_t * dev) { dev->rx_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->rx_urb) return 0; dev->tx_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->tx_urb) { usb_free_urb(dev->rx_urb); return 0; } dev->intr_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->intr_urb) { usb_free_urb(dev->rx_urb); usb_free_urb(dev->tx_urb); return 0; } return 1; } static void free_all_urbs(rtl8150_t * dev) { usb_free_urb(dev->rx_urb); usb_free_urb(dev->tx_urb); usb_free_urb(dev->intr_urb); } static void unlink_all_urbs(rtl8150_t * dev) { usb_kill_urb(dev->rx_urb); usb_kill_urb(dev->tx_urb); usb_kill_urb(dev->intr_urb); } static inline struct sk_buff *pull_skb(rtl8150_t *dev) { struct sk_buff *skb; int i; for (i = 0; i < RX_SKB_POOL_SIZE; i++) { if (dev->rx_skb_pool[i]) { skb = dev->rx_skb_pool[i]; dev->rx_skb_pool[i] = NULL; return skb; } } return NULL; } static void read_bulk_callback(struct urb *urb) { rtl8150_t *dev; unsigned pkt_len, res; struct sk_buff *skb; struct net_device *netdev; int status = urb->status; int result; unsigned long flags; dev = urb->context; if (!dev) return; if (test_bit(RTL8150_UNPLUG, &dev->flags)) return; netdev = dev->netdev; if (!netif_device_present(netdev)) return; switch (status) { case 0: break; case -ENOENT: return; /* the urb is in unlink state */ case -ETIME: if (printk_ratelimit()) dev_warn(&urb->dev->dev, "may be reset is needed?..\n"); goto goon; default: if (printk_ratelimit()) dev_warn(&urb->dev->dev, "Rx status %d\n", status); goto goon; } if (!dev->rx_skb) goto resched; /* protect against short packets (tell me why we got some?!?) 
*/ if (urb->actual_length < 4) goto goon; res = urb->actual_length; pkt_len = res - 4; skb_put(dev->rx_skb, pkt_len); dev->rx_skb->protocol = eth_type_trans(dev->rx_skb, netdev); netif_rx(dev->rx_skb); netdev->stats.rx_packets++; netdev->stats.rx_bytes += pkt_len; spin_lock_irqsave(&dev->rx_pool_lock, flags); skb = pull_skb(dev); spin_unlock_irqrestore(&dev->rx_pool_lock, flags); if (!skb) goto resched; dev->rx_skb = skb; goon: usb_fill_bulk_urb(dev->rx_urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), dev->rx_skb->data, RTL8150_MTU, read_bulk_callback, dev); result = usb_submit_urb(dev->rx_urb, GFP_ATOMIC); if (result == -ENODEV) netif_device_detach(dev->netdev); else if (result) { set_bit(RX_URB_FAIL, &dev->flags); goto resched; } else { clear_bit(RX_URB_FAIL, &dev->flags); } return; resched: tasklet_schedule(&dev->tl); } static void write_bulk_callback(struct urb *urb) { rtl8150_t *dev; int status = urb->status; dev = urb->context; if (!dev) return; dev_kfree_skb_irq(dev->tx_skb); if (!netif_device_present(dev->netdev)) return; if (status) dev_info(&urb->dev->dev, "%s: Tx status %d\n", dev->netdev->name, status); netif_trans_update(dev->netdev); netif_wake_queue(dev->netdev); } static void intr_callback(struct urb *urb) { rtl8150_t *dev; __u8 *d; int status = urb->status; int res; dev = urb->context; if (!dev) return; switch (status) { case 0: /* success */ break; case -ECONNRESET: /* unlink */ case -ENOENT: case -ESHUTDOWN: return; /* -EPIPE: should clear the halt */ default: dev_info(&urb->dev->dev, "%s: intr status %d\n", dev->netdev->name, status); goto resubmit; } d = urb->transfer_buffer; if (d[0] & TSR_ERRORS) { dev->netdev->stats.tx_errors++; if (d[INT_TSR] & (TSR_ECOL | TSR_JBR)) dev->netdev->stats.tx_aborted_errors++; if (d[INT_TSR] & TSR_LCOL) dev->netdev->stats.tx_window_errors++; if (d[INT_TSR] & TSR_LOSS_CRS) dev->netdev->stats.tx_carrier_errors++; } /* Report link status changes to the network stack */ if ((d[INT_MSR] & MSR_LINK) == 0) { if (netif_carrier_ok(dev->netdev)) { netif_carrier_off(dev->netdev); netdev_dbg(dev->netdev, "%s: LINK LOST\n", __func__); } } else { if (!netif_carrier_ok(dev->netdev)) { netif_carrier_on(dev->netdev); netdev_dbg(dev->netdev, "%s: LINK CAME BACK\n", __func__); } } resubmit: res = usb_submit_urb (urb, GFP_ATOMIC); if (res == -ENODEV) netif_device_detach(dev->netdev); else if (res) dev_err(&dev->udev->dev, "can't resubmit intr, %s-%s/input0, status %d\n", dev->udev->bus->bus_name, dev->udev->devpath, res); } static int rtl8150_suspend(struct usb_interface *intf, pm_message_t message) { rtl8150_t *dev = usb_get_intfdata(intf); netif_device_detach(dev->netdev); if (netif_running(dev->netdev)) { usb_kill_urb(dev->rx_urb); usb_kill_urb(dev->intr_urb); } return 0; } static int rtl8150_resume(struct usb_interface *intf) { rtl8150_t *dev = usb_get_intfdata(intf); netif_device_attach(dev->netdev); if (netif_running(dev->netdev)) { dev->rx_urb->status = 0; dev->rx_urb->actual_length = 0; read_bulk_callback(dev->rx_urb); dev->intr_urb->status = 0; dev->intr_urb->actual_length = 0; intr_callback(dev->intr_urb); } return 0; } /* ** ** network related part of the code ** */ static void fill_skb_pool(rtl8150_t *dev) { struct sk_buff *skb; int i; for (i = 0; i < RX_SKB_POOL_SIZE; i++) { if (dev->rx_skb_pool[i]) continue; skb = dev_alloc_skb(RTL8150_MTU + 2); if (!skb) { return; } skb_reserve(skb, 2); dev->rx_skb_pool[i] = skb; } } static void free_skb_pool(rtl8150_t *dev) { int i; for (i = 0; i < RX_SKB_POOL_SIZE; i++) dev_kfree_skb(dev->rx_skb_pool[i]); } 
static void rx_fixup(struct tasklet_struct *t) { struct rtl8150 *dev = from_tasklet(dev, t, tl); struct sk_buff *skb; int status; spin_lock_irq(&dev->rx_pool_lock); fill_skb_pool(dev); spin_unlock_irq(&dev->rx_pool_lock); if (test_bit(RX_URB_FAIL, &dev->flags)) if (dev->rx_skb) goto try_again; spin_lock_irq(&dev->rx_pool_lock); skb = pull_skb(dev); spin_unlock_irq(&dev->rx_pool_lock); if (skb == NULL) goto tlsched; dev->rx_skb = skb; usb_fill_bulk_urb(dev->rx_urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), dev->rx_skb->data, RTL8150_MTU, read_bulk_callback, dev); try_again: status = usb_submit_urb(dev->rx_urb, GFP_ATOMIC); if (status == -ENODEV) { netif_device_detach(dev->netdev); } else if (status) { set_bit(RX_URB_FAIL, &dev->flags); goto tlsched; } else { clear_bit(RX_URB_FAIL, &dev->flags); } return; tlsched: tasklet_schedule(&dev->tl); } static int enable_net_traffic(rtl8150_t * dev) { u8 cr, tcr, rcr, msr; if (!rtl8150_reset(dev)) { dev_warn(&dev->udev->dev, "device reset failed\n"); } /* RCR bit7=1 attach Rx info at the end; =0 HW CRC (which is broken) */ rcr = 0x9e; tcr = 0xd8; cr = 0x0c; if (!(rcr & 0x80)) set_bit(RTL8150_HW_CRC, &dev->flags); set_registers(dev, RCR, 1, &rcr); set_registers(dev, TCR, 1, &tcr); set_registers(dev, CR, 1, &cr); get_registers(dev, MSR, 1, &msr); return 0; } static void disable_net_traffic(rtl8150_t * dev) { u8 cr; get_registers(dev, CR, 1, &cr); cr &= 0xf3; set_registers(dev, CR, 1, &cr); } static void rtl8150_tx_timeout(struct net_device *netdev, unsigned int txqueue) { rtl8150_t *dev = netdev_priv(netdev); dev_warn(&netdev->dev, "Tx timeout.\n"); usb_unlink_urb(dev->tx_urb); netdev->stats.tx_errors++; } static void rtl8150_set_multicast(struct net_device *netdev) { rtl8150_t *dev = netdev_priv(netdev); u16 rx_creg = 0x9e; netif_stop_queue(netdev); if (netdev->flags & IFF_PROMISC) { rx_creg |= 0x0001; dev_info(&netdev->dev, "%s: promiscuous mode\n", netdev->name); } else if (!netdev_mc_empty(netdev) || (netdev->flags & IFF_ALLMULTI)) { rx_creg &= 0xfffe; rx_creg |= 0x0002; dev_dbg(&netdev->dev, "%s: allmulti set\n", netdev->name); } else { /* ~RX_MULTICAST, ~RX_PROMISCUOUS */ rx_creg &= 0x00fc; } async_set_registers(dev, RCR, sizeof(rx_creg), rx_creg); netif_wake_queue(netdev); } static netdev_tx_t rtl8150_start_xmit(struct sk_buff *skb, struct net_device *netdev) { rtl8150_t *dev = netdev_priv(netdev); int count, res; netif_stop_queue(netdev); count = (skb->len < 60) ? 60 : skb->len; count = (count & 0x3f) ? count : count + 1; dev->tx_skb = skb; usb_fill_bulk_urb(dev->tx_urb, dev->udev, usb_sndbulkpipe(dev->udev, 2), skb->data, count, write_bulk_callback, dev); if ((res = usb_submit_urb(dev->tx_urb, GFP_ATOMIC))) { /* Can we get/handle EPIPE here? 
*/ if (res == -ENODEV) netif_device_detach(dev->netdev); else { dev_warn(&netdev->dev, "failed tx_urb %d\n", res); netdev->stats.tx_errors++; netif_start_queue(netdev); } } else { netdev->stats.tx_packets++; netdev->stats.tx_bytes += skb->len; netif_trans_update(netdev); } return NETDEV_TX_OK; } static void set_carrier(struct net_device *netdev) { rtl8150_t *dev = netdev_priv(netdev); short tmp; get_registers(dev, CSCR, 2, &tmp); if (tmp & CSCR_LINK_STATUS) netif_carrier_on(netdev); else netif_carrier_off(netdev); } static int rtl8150_open(struct net_device *netdev) { rtl8150_t *dev = netdev_priv(netdev); int res; if (dev->rx_skb == NULL) dev->rx_skb = pull_skb(dev); if (!dev->rx_skb) return -ENOMEM; set_registers(dev, IDR, 6, netdev->dev_addr); usb_fill_bulk_urb(dev->rx_urb, dev->udev, usb_rcvbulkpipe(dev->udev, 1), dev->rx_skb->data, RTL8150_MTU, read_bulk_callback, dev); if ((res = usb_submit_urb(dev->rx_urb, GFP_KERNEL))) { if (res == -ENODEV) netif_device_detach(dev->netdev); dev_warn(&netdev->dev, "rx_urb submit failed: %d\n", res); return res; } usb_fill_int_urb(dev->intr_urb, dev->udev, usb_rcvintpipe(dev->udev, 3), dev->intr_buff, INTBUFSIZE, intr_callback, dev, dev->intr_interval); if ((res = usb_submit_urb(dev->intr_urb, GFP_KERNEL))) { if (res == -ENODEV) netif_device_detach(dev->netdev); dev_warn(&netdev->dev, "intr_urb submit failed: %d\n", res); usb_kill_urb(dev->rx_urb); return res; } enable_net_traffic(dev); set_carrier(netdev); netif_start_queue(netdev); return res; } static int rtl8150_close(struct net_device *netdev) { rtl8150_t *dev = netdev_priv(netdev); netif_stop_queue(netdev); if (!test_bit(RTL8150_UNPLUG, &dev->flags)) disable_net_traffic(dev); unlink_all_urbs(dev); return 0; } static void rtl8150_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { rtl8150_t *dev = netdev_priv(netdev); strscpy(info->driver, driver_name, sizeof(info->driver)); strscpy(info->version, DRIVER_VERSION, sizeof(info->version)); usb_make_path(dev->udev, info->bus_info, sizeof(info->bus_info)); } static int rtl8150_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *ecmd) { rtl8150_t *dev = netdev_priv(netdev); short lpa, bmcr; u32 supported; supported = (SUPPORTED_10baseT_Half | SUPPORTED_10baseT_Full | SUPPORTED_100baseT_Half | SUPPORTED_100baseT_Full | SUPPORTED_Autoneg | SUPPORTED_TP | SUPPORTED_MII); ecmd->base.port = PORT_TP; ecmd->base.phy_address = dev->phy; get_registers(dev, BMCR, 2, &bmcr); get_registers(dev, ANLP, 2, &lpa); if (bmcr & BMCR_ANENABLE) { u32 speed = ((lpa & (LPA_100HALF | LPA_100FULL)) ? SPEED_100 : SPEED_10); ecmd->base.speed = speed; ecmd->base.autoneg = AUTONEG_ENABLE; if (speed == SPEED_100) ecmd->base.duplex = (lpa & LPA_100FULL) ? DUPLEX_FULL : DUPLEX_HALF; else ecmd->base.duplex = (lpa & LPA_10FULL) ? DUPLEX_FULL : DUPLEX_HALF; } else { ecmd->base.autoneg = AUTONEG_DISABLE; ecmd->base.speed = ((bmcr & BMCR_SPEED100) ? SPEED_100 : SPEED_10); ecmd->base.duplex = (bmcr & BMCR_FULLDPLX) ? 
DUPLEX_FULL : DUPLEX_HALF; } ethtool_convert_legacy_u32_to_link_mode(ecmd->link_modes.supported, supported); return 0; } static const struct ethtool_ops ops = { .get_drvinfo = rtl8150_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = rtl8150_get_link_ksettings, }; static int rtl8150_siocdevprivate(struct net_device *netdev, struct ifreq *rq, void __user *udata, int cmd) { rtl8150_t *dev = netdev_priv(netdev); u16 *data = (u16 *) & rq->ifr_ifru; int res = 0; switch (cmd) { case SIOCDEVPRIVATE: data[0] = dev->phy; fallthrough; case SIOCDEVPRIVATE + 1: read_mii_word(dev, dev->phy, (data[1] & 0x1f), &data[3]); break; case SIOCDEVPRIVATE + 2: if (!capable(CAP_NET_ADMIN)) return -EPERM; write_mii_word(dev, dev->phy, (data[1] & 0x1f), data[2]); break; default: res = -EOPNOTSUPP; } return res; } static const struct net_device_ops rtl8150_netdev_ops = { .ndo_open = rtl8150_open, .ndo_stop = rtl8150_close, .ndo_siocdevprivate = rtl8150_siocdevprivate, .ndo_start_xmit = rtl8150_start_xmit, .ndo_tx_timeout = rtl8150_tx_timeout, .ndo_set_rx_mode = rtl8150_set_multicast, .ndo_set_mac_address = rtl8150_set_mac_address, .ndo_validate_addr = eth_validate_addr, }; static int rtl8150_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); rtl8150_t *dev; struct net_device *netdev; netdev = alloc_etherdev(sizeof(rtl8150_t)); if (!netdev) return -ENOMEM; dev = netdev_priv(netdev); dev->intr_buff = kmalloc(INTBUFSIZE, GFP_KERNEL); if (!dev->intr_buff) { free_netdev(netdev); return -ENOMEM; } tasklet_setup(&dev->tl, rx_fixup); spin_lock_init(&dev->rx_pool_lock); dev->udev = udev; dev->netdev = netdev; netdev->netdev_ops = &rtl8150_netdev_ops; netdev->watchdog_timeo = RTL8150_TX_TIMEOUT; netdev->ethtool_ops = &ops; dev->intr_interval = 100; /* 100ms */ if (!alloc_all_urbs(dev)) { dev_err(&intf->dev, "out of memory\n"); goto out; } if (!rtl8150_reset(dev)) { dev_err(&intf->dev, "couldn't reset the device\n"); goto out1; } fill_skb_pool(dev); set_ethernet_addr(dev); usb_set_intfdata(intf, dev); SET_NETDEV_DEV(netdev, &intf->dev); if (register_netdev(netdev) != 0) { dev_err(&intf->dev, "couldn't register the device\n"); goto out2; } dev_info(&intf->dev, "%s: rtl8150 is detected\n", netdev->name); return 0; out2: usb_set_intfdata(intf, NULL); free_skb_pool(dev); out1: free_all_urbs(dev); out: kfree(dev->intr_buff); free_netdev(netdev); return -EIO; } static void rtl8150_disconnect(struct usb_interface *intf) { rtl8150_t *dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (dev) { set_bit(RTL8150_UNPLUG, &dev->flags); tasklet_kill(&dev->tl); unregister_netdev(dev->netdev); unlink_all_urbs(dev); free_all_urbs(dev); free_skb_pool(dev); dev_kfree_skb(dev->rx_skb); kfree(dev->intr_buff); free_netdev(dev->netdev); } } static struct usb_driver rtl8150_driver = { .name = driver_name, .probe = rtl8150_probe, .disconnect = rtl8150_disconnect, .id_table = rtl8150_table, .suspend = rtl8150_suspend, .resume = rtl8150_resume, .disable_hub_initiated_lpm = 1, }; module_usb_driver(rtl8150_driver); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
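/*
 * Illustration (not driver code): rtl8150_start_xmit() above pads frames to
 * the 60-byte minimum frame length and then bumps any length that is an exact
 * multiple of 64 by one byte ("count & 0x3f").  The usual reason for such a
 * bump is to keep the USB bulk-out transfer from ending on a full 64-byte
 * packet, which would otherwise require a zero-length packet to terminate the
 * transfer -- that reading is mine, the driver does not state it.  The
 * standalone sketch below reproduces just the length computation.
 */
#include <stdio.h>

static unsigned int tx_length(unsigned int skb_len)
{
	unsigned int count = (skb_len < 60) ? 60 : skb_len;

	return (count & 0x3f) ? count : count + 1;	/* same test as the driver */
}

int main(void)
{
	printf("len 42  -> %u\n", tx_length(42));	/* padded to 60 */
	printf("len 100 -> %u\n", tx_length(100));	/* unchanged */
	printf("len 128 -> %u\n", tx_length(128));	/* multiple of 64 -> 129 */
	return 0;
}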
#ifndef __DRM_GEM_H__ #define __DRM_GEM_H__ /* * GEM Graphics Execution Manager Driver Interfaces * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * All rights reserved. * Copyright © 2014 Intel Corporation * Daniel Vetter <daniel.vetter@ffwll.ch> * * Author: Rickard E. (Rik) Faith <faith@valinux.com> * Author: Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software.
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/kref.h> #include <linux/dma-resv.h> #include <linux/list.h> #include <linux/mutex.h> #include <drm/drm_vma_manager.h> struct iosys_map; struct drm_gem_object; /** * enum drm_gem_object_status - bitmask of object state for fdinfo reporting * @DRM_GEM_OBJECT_RESIDENT: object is resident in memory (ie. not unpinned) * @DRM_GEM_OBJECT_PURGEABLE: object marked as purgeable by userspace * * Bitmask of status used for fdinfo memory stats, see &drm_gem_object_funcs.status * and drm_show_fdinfo(). Note that an object can DRM_GEM_OBJECT_PURGEABLE if * it still active or not resident, in which case drm_show_fdinfo() will not * account for it as purgeable. So drivers do not need to check if the buffer * is idle and resident to return this bit. (Ie. userspace can mark a buffer * as purgeable even while it is still busy on the GPU.. it does not _actually_ * become puregeable until it becomes idle. The status gem object func does * not need to consider this.) */ enum drm_gem_object_status { DRM_GEM_OBJECT_RESIDENT = BIT(0), DRM_GEM_OBJECT_PURGEABLE = BIT(1), }; /** * struct drm_gem_object_funcs - GEM object functions */ struct drm_gem_object_funcs { /** * @free: * * Deconstructor for drm_gem_objects. * * This callback is mandatory. */ void (*free)(struct drm_gem_object *obj); /** * @open: * * Called upon GEM handle creation. * * This callback is optional. */ int (*open)(struct drm_gem_object *obj, struct drm_file *file); /** * @close: * * Called upon GEM handle release. * * This callback is optional. */ void (*close)(struct drm_gem_object *obj, struct drm_file *file); /** * @print_info: * * If driver subclasses struct &drm_gem_object, it can implement this * optional hook for printing additional driver specific info. * * drm_printf_indent() should be used in the callback passing it the * indent argument. * * This callback is called from drm_gem_print_info(). * * This callback is optional. */ void (*print_info)(struct drm_printer *p, unsigned int indent, const struct drm_gem_object *obj); /** * @export: * * Export backing buffer as a &dma_buf. * If this is not set drm_gem_prime_export() is used. * * This callback is optional. */ struct dma_buf *(*export)(struct drm_gem_object *obj, int flags); /** * @pin: * * Pin backing buffer in memory. Used by the drm_gem_map_attach() helper. * * This callback is optional. */ int (*pin)(struct drm_gem_object *obj); /** * @unpin: * * Unpin backing buffer. Used by the drm_gem_map_detach() helper. * * This callback is optional. */ void (*unpin)(struct drm_gem_object *obj); /** * @get_sg_table: * * Returns a Scatter-Gather table representation of the buffer. * Used when exporting a buffer by the drm_gem_map_dma_buf() helper. * Releasing is done by calling dma_unmap_sg_attrs() and sg_free_table() * in drm_gem_unmap_buf(), therefore these helpers and this callback * here cannot be used for sg tables pointing at driver private memory * ranges. * * See also drm_prime_pages_to_sg(). 
*/ struct sg_table *(*get_sg_table)(struct drm_gem_object *obj); /** * @vmap: * * Returns a virtual address for the buffer. Used by the * drm_gem_dmabuf_vmap() helper. * * This callback is optional. */ int (*vmap)(struct drm_gem_object *obj, struct iosys_map *map); /** * @vunmap: * * Releases the address previously returned by @vmap. Used by the * drm_gem_dmabuf_vunmap() helper. * * This callback is optional. */ void (*vunmap)(struct drm_gem_object *obj, struct iosys_map *map); /** * @mmap: * * Handle mmap() of the gem object, setup vma accordingly. * * This callback is optional. * * The callback is used by both drm_gem_mmap_obj() and * drm_gem_prime_mmap(). When @mmap is present @vm_ops is not * used, the @mmap callback must set vma->vm_ops instead. */ int (*mmap)(struct drm_gem_object *obj, struct vm_area_struct *vma); /** * @evict: * * Evicts gem object out from memory. Used by the drm_gem_object_evict() * helper. Returns 0 on success, -errno otherwise. * * This callback is optional. */ int (*evict)(struct drm_gem_object *obj); /** * @status: * * The optional status callback can return additional object state * which determines which stats the object is counted against. The * callback is called under table_lock. Racing against object status * change is "harmless", and the callback can expect to not race * against object destruction. * * Called by drm_show_memory_stats(). */ enum drm_gem_object_status (*status)(struct drm_gem_object *obj); /** * @rss: * * Return resident size of the object in physical memory. * * Called by drm_show_memory_stats(). */ size_t (*rss)(struct drm_gem_object *obj); /** * @vm_ops: * * Virtual memory operations used with mmap. * * This is optional but necessary for mmap support. */ const struct vm_operations_struct *vm_ops; }; /** * struct drm_gem_lru - A simple LRU helper * * A helper for tracking GEM objects in a given state, to aid in * driver's shrinker implementation. Tracks the count of pages * for lockless &shrinker.count_objects, and provides * &drm_gem_lru_scan for driver's &shrinker.scan_objects * implementation. */ struct drm_gem_lru { /** * @lock: * * Lock protecting movement of GEM objects between LRUs. All * LRUs that the object can move between should be protected * by the same lock. */ struct mutex *lock; /** * @count: * * The total number of backing pages of the GEM objects in * this LRU. */ long count; /** * @list: * * The LRU list. */ struct list_head list; }; /** * struct drm_gem_object - GEM buffer object * * This structure defines the generic parts for GEM buffer objects, which are * mostly around handling mmap and userspace handles. * * Buffer objects are often abbreviated to BO. */ struct drm_gem_object { /** * @refcount: * * Reference count of this object * * Please use drm_gem_object_get() to acquire and drm_gem_object_put_locked() * or drm_gem_object_put() to release a reference to a GEM * buffer object. */ struct kref refcount; /** * @handle_count: * * This is the GEM file_priv handle count of this object. * * Each handle also holds a reference. Note that when the handle_count * drops to 0 any global names (e.g. the id in the flink namespace) will * be cleared. * * Protected by &drm_device.object_name_lock. */ unsigned handle_count; /** * @dev: DRM dev this object belongs to. */ struct drm_device *dev; /** * @filp: * * SHMEM file node used as backing storage for swappable buffer objects. * GEM also supports driver private objects with driver-specific backing * storage (contiguous DMA memory, special reserved blocks). 
In this * case @filp is NULL. */ struct file *filp; /** * @vma_node: * * Mapping info for this object to support mmap. Drivers are supposed to * allocate the mmap offset using drm_gem_create_mmap_offset(). The * offset itself can be retrieved using drm_vma_node_offset_addr(). * * Memory mapping itself is handled by drm_gem_mmap(), which also checks * that userspace is allowed to access the object. */ struct drm_vma_offset_node vma_node; /** * @size: * * Size of the object, in bytes. Immutable over the object's * lifetime. */ size_t size; /** * @name: * * Global name for this object, starts at 1. 0 means unnamed. * Access is covered by &drm_device.object_name_lock. This is used by * the GEM_FLINK and GEM_OPEN ioctls. */ int name; /** * @dma_buf: * * dma-buf associated with this GEM object. * * Pointer to the dma-buf associated with this gem object (either * through importing or exporting). We break the resulting reference * loop when the last gem handle for this object is released. * * Protected by &drm_device.object_name_lock. */ struct dma_buf *dma_buf; /** * @import_attach: * * dma-buf attachment backing this object. * * Any foreign dma_buf imported as a gem object has this set to the * attachment point for the device. This is invariant over the lifetime * of a gem object. * * The &drm_gem_object_funcs.free callback is responsible for * cleaning up the dma_buf attachment and references acquired at import * time. * * Note that the drm gem/prime core does not depend upon drivers setting * this field any more. So for drivers where this doesn't make sense * (e.g. virtual devices or a displaylink behind an usb bus) they can * simply leave it as NULL. */ struct dma_buf_attachment *import_attach; /** * @resv: * * Pointer to reservation object associated with the this GEM object. * * Normally (@resv == &@_resv) except for imported GEM objects. */ struct dma_resv *resv; /** * @_resv: * * A reservation object for this GEM object. * * This is unused for imported GEM objects. */ struct dma_resv _resv; /** * @gpuva: * * Provides the list of GPU VAs attached to this GEM object. * * Drivers should lock list accesses with the GEMs &dma_resv lock * (&drm_gem_object.resv) or a custom lock if one is provided. */ struct { struct list_head list; #ifdef CONFIG_LOCKDEP struct lockdep_map *lock_dep_map; #endif } gpuva; /** * @funcs: * * Optional GEM object functions. If this is set, it will be used instead of the * corresponding &drm_driver GEM callbacks. * * New drivers should use this. * */ const struct drm_gem_object_funcs *funcs; /** * @lru_node: * * List node in a &drm_gem_lru. */ struct list_head lru_node; /** * @lru: * * The current LRU list that the GEM object is on. */ struct drm_gem_lru *lru; }; /** * DRM_GEM_FOPS - Default drm GEM file operations * * This macro provides a shorthand for setting the GEM file ops in the * &file_operations structure. If all you need are the default ops, use * DEFINE_DRM_GEM_FOPS instead. */ #define DRM_GEM_FOPS \ .open = drm_open,\ .release = drm_release,\ .unlocked_ioctl = drm_ioctl,\ .compat_ioctl = drm_compat_ioctl,\ .poll = drm_poll,\ .read = drm_read,\ .llseek = noop_llseek,\ .mmap = drm_gem_mmap /** * DEFINE_DRM_GEM_FOPS() - macro to generate file operations for GEM drivers * @name: name for the generated structure * * This macro autogenerates a suitable &struct file_operations for GEM based * drivers, which can be assigned to &drm_driver.fops. 
Note that this structure * cannot be shared between drivers, because it contains a reference to the * current module using THIS_MODULE. * * Note that the declaration is already marked as static - if you need a * non-static version of this you're probably doing it wrong and will break the * THIS_MODULE reference by accident. */ #define DEFINE_DRM_GEM_FOPS(name) \ static const struct file_operations name = {\ .owner = THIS_MODULE,\ DRM_GEM_FOPS,\ } void drm_gem_object_release(struct drm_gem_object *obj); void drm_gem_object_free(struct kref *kref); int drm_gem_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); void drm_gem_private_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); void drm_gem_private_object_fini(struct drm_gem_object *obj); void drm_gem_vm_open(struct vm_area_struct *vma); void drm_gem_vm_close(struct vm_area_struct *vma); int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size, struct vm_area_struct *vma); int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); /** * drm_gem_object_get - acquire a GEM buffer object reference * @obj: GEM buffer object * * This function acquires an additional reference to @obj. It is illegal to * call this without already holding a reference. No locks required. */ static inline void drm_gem_object_get(struct drm_gem_object *obj) { kref_get(&obj->refcount); } __attribute__((nonnull)) static inline void __drm_gem_object_put(struct drm_gem_object *obj) { kref_put(&obj->refcount, drm_gem_object_free); } /** * drm_gem_object_put - drop a GEM buffer object reference * @obj: GEM buffer object * * This releases a reference to @obj. */ static inline void drm_gem_object_put(struct drm_gem_object *obj) { if (obj) __drm_gem_object_put(obj); } int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); int drm_gem_handle_delete(struct drm_file *filp, u32 handle); void drm_gem_free_mmap_offset(struct drm_gem_object *obj); int drm_gem_create_mmap_offset(struct drm_gem_object *obj); int drm_gem_create_mmap_offset_size(struct drm_gem_object *obj, size_t size); struct page **drm_gem_get_pages(struct drm_gem_object *obj); void drm_gem_put_pages(struct drm_gem_object *obj, struct page **pages, bool dirty, bool accessed); int drm_gem_vmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map); void drm_gem_vunmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map); int drm_gem_objects_lookup(struct drm_file *filp, void __user *bo_handles, int count, struct drm_gem_object ***objs_out); struct drm_gem_object *drm_gem_object_lookup(struct drm_file *filp, u32 handle); long drm_gem_dma_resv_wait(struct drm_file *filep, u32 handle, bool wait_all, unsigned long timeout); int drm_gem_lock_reservations(struct drm_gem_object **objs, int count, struct ww_acquire_ctx *acquire_ctx); void drm_gem_unlock_reservations(struct drm_gem_object **objs, int count, struct ww_acquire_ctx *acquire_ctx); int drm_gem_dumb_map_offset(struct drm_file *file, struct drm_device *dev, u32 handle, u64 *offset); void drm_gem_lru_init(struct drm_gem_lru *lru, struct mutex *lock); void drm_gem_lru_remove(struct drm_gem_object *obj); void drm_gem_lru_move_tail_locked(struct drm_gem_lru *lru, struct drm_gem_object *obj); void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj); unsigned long drm_gem_lru_scan(struct drm_gem_lru *lru, unsigned int nr_to_scan, unsigned long *remaining, bool (*shrink)(struct drm_gem_object *obj)); int drm_gem_evict(struct 
drm_gem_object *obj);

#ifdef CONFIG_LOCKDEP
/**
 * drm_gem_gpuva_set_lock() - Set the lock protecting accesses to the gpuva list.
 * @obj: the &drm_gem_object
 * @lock: the lock used to protect the gpuva list. The locking primitive
 * must contain a dep_map field.
 *
 * Call this if you're not protecting access to the gpuva list with the
 * dma-resv lock, but with a custom lock.
 */
#define drm_gem_gpuva_set_lock(obj, lock) \
	if (!WARN((obj)->gpuva.lock_dep_map, \
		  "GEM GPUVA lock should be set only once.")) \
		(obj)->gpuva.lock_dep_map = &(lock)->dep_map
#define drm_gem_gpuva_assert_lock_held(obj) \
	lockdep_assert((obj)->gpuva.lock_dep_map ? \
		       lock_is_held((obj)->gpuva.lock_dep_map) : \
		       dma_resv_held((obj)->resv))
#else
#define drm_gem_gpuva_set_lock(obj, lock) do {} while (0)
#define drm_gem_gpuva_assert_lock_held(obj) do {} while (0)
#endif

/**
 * drm_gem_gpuva_init() - initialize the gpuva list of a GEM object
 * @obj: the &drm_gem_object
 *
 * This initializes the &drm_gem_object's &drm_gpuvm_bo list.
 *
 * Calling this function is only necessary for drivers intending to support the
 * &drm_driver_feature DRIVER_GEM_GPUVA.
 *
 * See also drm_gem_gpuva_set_lock().
 */
static inline void drm_gem_gpuva_init(struct drm_gem_object *obj)
{
	INIT_LIST_HEAD(&obj->gpuva.list);
}

/**
 * drm_gem_for_each_gpuvm_bo() - iterator to walk over a list of &drm_gpuvm_bo
 * @entry__: &drm_gpuvm_bo structure to assign to in each iteration step
 * @obj__: the &drm_gem_object the &drm_gpuvm_bo to walk are associated with
 *
 * This iterator walks over all &drm_gpuvm_bo structures associated with the
 * &drm_gem_object.
 */
#define drm_gem_for_each_gpuvm_bo(entry__, obj__) \
	list_for_each_entry(entry__, &(obj__)->gpuva.list, list.entry.gem)

/**
 * drm_gem_for_each_gpuvm_bo_safe() - iterator to safely walk over a list of
 * &drm_gpuvm_bo
 * @entry__: &drm_gpuvm_bo structure to assign to in each iteration step
 * @next__: &drm_gpuvm_bo to store the next step
 * @obj__: the &drm_gem_object the &drm_gpuvm_bo to walk are associated with
 *
 * This iterator walks over all &drm_gpuvm_bo structures associated with the
 * &drm_gem_object. It is implemented with list_for_each_entry_safe(), hence
 * it is safe against removal of elements.
 */
#define drm_gem_for_each_gpuvm_bo_safe(entry__, next__, obj__) \
	list_for_each_entry_safe(entry__, next__, &(obj__)->gpuva.list, list.entry.gem)

#endif /* __DRM_GEM_H__ */
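/*
 * Added illustration, not part of the header above: a minimal sketch of how a
 * hypothetical driver could wire up what is documented here -- an object
 * constructor that installs a &drm_gem_object_funcs table (only @free is
 * mandatory), plus the DEFINE_DRM_GEM_FOPS() file operations hooked into a
 * &drm_driver.  All "foo_*" names are invented for this example.
 */
#include <linux/err.h>
#include <linux/slab.h>

#include <drm/drm_drv.h>
#include <drm/drm_gem.h>

static void foo_gem_free(struct drm_gem_object *obj)
{
	/* Assumes the object was allocated with kzalloc() in foo_gem_create(). */
	drm_gem_object_release(obj);
	kfree(obj);
}

static const struct drm_gem_object_funcs foo_gem_funcs = {
	.free = foo_gem_free,		/* the only mandatory callback */
};

static struct drm_gem_object *foo_gem_create(struct drm_device *dev, size_t size)
{
	struct drm_gem_object *obj;
	int ret;

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj)
		return ERR_PTR(-ENOMEM);

	obj->funcs = &foo_gem_funcs;	/* used instead of &drm_driver callbacks */

	ret = drm_gem_object_init(dev, obj, size);	/* shmem-backed object */
	if (ret) {
		kfree(obj);
		return ERR_PTR(ret);
	}

	return obj;
}

/* Generates "static const struct file_operations foo_fops" using THIS_MODULE. */
DEFINE_DRM_GEM_FOPS(foo_fops);

static const struct drm_driver foo_driver = {
	.driver_features = DRIVER_GEM,
	.fops		 = &foo_fops,
	.name		 = "foo",
};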
// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
* * The options processing module for ip.c * * Authors: A.N.Kuznetsov * */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/cipso_ipv4.h> #include <net/ip_fib.h> /* * Write options to IP header, record destination address to * source route option, address of outgoing interface * (we should already know it, so that this function is allowed be * called only after routing decision) and timestamp, * if we originate this datagram. * * daddr is real destination address, next hop is recorded in IP header. * saddr is address of outgoing interface. */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt) { unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (opt->rr_needaddr) ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } } /* * Provided (sopt, skb) points to received options, * build in dopt compiled option set appropriate for answering. * i.e. invert SRR option, copy anothers, * and grab room in RR/TS options. * * NOTE: dopt cannot point to skb. 
*/ int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); if (sopt->optlen == 0) return 0; sptr = skb_network_header(skb); dptr = dopt->__data; if (sopt->rr) { optlen = sptr[sopt->rr+1]; soffset = sptr[sopt->rr+2]; dopt->rr = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->rr, optlen); if (sopt->rr_needaddr && soffset <= optlen) { if (soffset + 3 > optlen) return -EINVAL; dptr[2] = soffset + 4; dopt->rr_needaddr = 1; } dptr += optlen; dopt->optlen += optlen; } if (sopt->ts) { optlen = sptr[sopt->ts+1]; soffset = sptr[sopt->ts+2]; dopt->ts = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->ts, optlen); if (soffset <= optlen) { if (sopt->ts_needaddr) { if (soffset + 3 > optlen) return -EINVAL; dopt->ts_needaddr = 1; soffset += 4; } if (sopt->ts_needtime) { if (soffset + 3 > optlen) return -EINVAL; if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { dopt->ts_needtime = 1; soffset += 4; } else { dopt->ts_needtime = 0; if (soffset + 7 <= optlen) { __be32 addr; memcpy(&addr, dptr+soffset-1, 4); if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } } } } dptr[2] = soffset; } dptr += optlen; dopt->optlen += optlen; } if (sopt->srr) { unsigned char *start = sptr+sopt->srr; __be32 faddr; optlen = start[1]; soffset = start[2]; doffset = 0; if (soffset > optlen) soffset = optlen + 1; soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. */ if (memcmp(&ip_hdr(skb)->saddr, &start[soffset + 3], 4) == 0) doffset -= 4; } if (doffset > 3) { dopt->faddr = faddr; dptr[0] = start[0]; dptr[1] = doffset+3; dptr[2] = 4; dptr += doffset+3; dopt->srr = dopt->optlen + sizeof(struct iphdr); dopt->optlen += doffset+3; dopt->is_strictroute = sopt->is_strictroute; } } if (sopt->cipso) { optlen = sptr[sopt->cipso+1]; dopt->cipso = dopt->optlen+sizeof(struct iphdr); memcpy(dptr, sptr+sopt->cipso, optlen); dptr += optlen; dopt->optlen += optlen; } while (dopt->optlen & 3) { *dptr++ = IPOPT_END; dopt->optlen++; } return 0; } /* * Options "fragmenting", just fill options not * allowed in fragments with NOOPs. * Simple and stupid 8), but the most efficient way. */ void ip_options_fragment(struct sk_buff *skb) { unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); struct ip_options *opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); l -= optlen; optptr += optlen; } opt->ts = 0; opt->rr = 0; opt->rr_needaddr = 0; opt->ts_needaddr = 0; opt->ts_needtime = 0; } /* helper used by ip_options_compile() to call fib_compute_spec_dst() * at most one time. */ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) { if (*spec_dst == htonl(INADDR_ANY)) *spec_dst = fib_compute_spec_dst(skb); } /* * Verify options and fill pointers in struct options. * Caller should clear *opt, and set opt->data. * If opt == NULL, then skb->data should point to IP header. 
*/ int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; struct rtable *rt = NULL; unsigned char *optptr; unsigned char *iph; int optlen, l; if (skb) { rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else optptr = opt->__data; iph = optptr - sizeof(struct iphdr); for (l = opt->optlen; l > 0; ) { switch (*optptr) { case IPOPT_END: for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; case IPOPT_NOOP: l--; optptr++; continue; } if (unlikely(l < 2)) { pp_ptr = optptr; goto error; } optlen = optptr[1]; if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { case IPOPT_SSRR: case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } /* NB: cf RFC-1812 5.2.4.1 */ if (opt->srr) { pp_ptr = optptr; goto error; } if (!skb) { if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { pp_ptr = optptr + 1; goto error; } memcpy(&opt->faddr, &optptr[3], 4); if (optlen > 7) memmove(&optptr[3], &optptr[7], optlen-7); } opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; } if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; opt->rr_needaddr = 1; } opt->rr = optptr - iph; break; case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; } if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 5) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; case IPOPT_TS_TSANDADDR: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; optptr[2] += 8; break; case IPOPT_TS_PRESPEC: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) timeptr = &optptr[optptr[2]+3]; } opt->ts_needtime = 1; optptr[2] += 8; break; default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; } break; } if (timeptr) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; } if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; } opt->cipso = optptr - iph; if (cipso_v4_validate(skb, &optptr)) { pp_ptr = 
optptr; goto error; } break; case IPOPT_SEC: case IPOPT_SID: default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; } break; } l -= optlen; optptr += optlen; } eol: if (!pp_ptr) return 0; error: if (info) *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) { int ret; __be32 info; ret = __ip_options_compile(net, opt, skb, &info); if (ret != 0 && skb) icmp_send(skb, ICMP_PARAMETERPROB, 0, info); return ret; } EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). */ void ip_options_undo(struct ip_options *opt) { if (opt->srr) { unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); memmove(optptr + 7, optptr + 3, optptr[1] - 7); memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); if (opt->ts_needtime) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } } } int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen) { struct ip_options_rcu *opt; opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); if (!opt) return -ENOMEM; if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) { kfree(opt); return -EFAULT; } while (optlen & 3) opt->opt.__data[optlen++] = IPOPT_END; opt->opt.optlen = optlen; if (optlen && ip_options_compile(net, &opt->opt, NULL)) { kfree(opt); return -EINVAL; } kfree(*optp); *optp = opt; return 0; } void ip_forward_options(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); unsigned char *optptr; struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); opt->is_changed = 1; } if (opt->srr_is_hit) { int srrptr, srrspace; optptr = raw + opt->srr; for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { if (srrptr + 3 > srrspace) break; if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_hdr(skb)->daddr = opt->nexthop; ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else { net_crit_ratelimited("%s(): Argh! 
Destination lost!\n", __func__); } if (opt->ts_needaddr) { optptr = raw + opt->ts; ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); opt->is_changed = 1; } } if (opt->is_changed) { opt->is_changed = 0; ip_send_check(ip_hdr(skb)); } } int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; unsigned long orefdst; int err; if (!rt) return 0; if (skb->pkt_type != PACKET_HOST) return -EINVAL; if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); return -EINVAL; } if (rt->rt_type != RTN_LOCAL) return -EINVAL; for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; } memcpy(&nexthop, &optptr[srrptr-1], 4); orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); skb->_skb_refdst = orefdst; return -EINVAL; } refdst_drop(orefdst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ iph->daddr = nexthop; opt->is_changed = 1; } if (srrptr <= srrspace) { opt->srr_is_hit = 1; opt->nexthop = nexthop; opt->is_changed = 1; } return 0; } EXPORT_SYMBOL(ip_options_rcv_srr);
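/*
 * Added illustration (userspace, not part of ip_options.c): the raw option
 * bytes that ip_options_get() copies, pads to a 4-byte multiple with
 * end-of-option bytes and validates via ip_options_compile() (with skb ==
 * NULL) are normally supplied through the IP_OPTIONS socket option.  The
 * sketch below installs a record-route option with room for three addresses;
 * error handling is kept minimal.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>

int main(void)
{
	unsigned char opts[16] = { 0 };	/* trailing zero bytes act as end-of-option padding */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	opts[0] = IPOPT_RR;	/* type 7: record route */
	opts[1] = 15;		/* 3 header bytes + 3 * 4 bytes of address slots */
	opts[2] = 4;		/* pointer to the first free address slot */

	/* EINVAL here means the kernel's option compiler rejected the buffer. */
	if (setsockopt(fd, IPPROTO_IP, IP_OPTIONS, opts, sizeof(opts)))
		perror("setsockopt(IP_OPTIONS)");

	close(fd);
	return 0;
}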
// SPDX-License-Identifier: GPL-2.0
/*
 * cfg80211 - wext compat code
 *
 * This is temporary code until all wireless functionality is migrated
 * into cfg80211, when that happens all the exports here go away and
 * we directly assign the wireless handlers of wireless interfaces.
 *
 * Copyright 2008-2009	Johannes Berg <johannes@sipsolutions.net>
 * Copyright (C) 2019-2023 Intel Corporation
 */

#include <linux/export.h>
#include <linux/wireless.h>
#include <linux/nl80211.h>
#include <linux/if_arp.h>
#include <linux/etherdevice.h>
#include <linux/slab.h>
#include <net/iw_handler.h>
#include <net/cfg80211.h>
#include <net/cfg80211-wext.h>
#include "wext-compat.h"
#include "core.h"
#include "rdev-ops.h"

int cfg80211_wext_giwname(struct net_device *dev,
			  struct iw_request_info *info,
			  union iwreq_data *wrqu, char *extra)
{
	strcpy(wrqu->name, "IEEE 802.11");
	return 0;
}
EXPORT_WEXT_HANDLER(cfg80211_wext_giwname);

int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
			  union iwreq_data *wrqu, char *extra)
{
	__u32 *mode = &wrqu->mode;
	struct wireless_dev *wdev = dev->ieee80211_ptr;
	struct cfg80211_registered_device *rdev;
	struct vif_params vifparams;
	enum nl80211_iftype type;
	int ret;

	rdev = wiphy_to_rdev(wdev->wiphy);

	switch (*mode) {
	case IW_MODE_INFRA:
		type = NL80211_IFTYPE_STATION;
		break;
	case IW_MODE_ADHOC:
		type = NL80211_IFTYPE_ADHOC;
		break;
	case IW_MODE_MONITOR:
		type = NL80211_IFTYPE_MONITOR;
		break;
	default:
		return -EINVAL;
	}

	if (type == wdev->iftype)
		return 0;

	memset(&vifparams, 0, sizeof(vifparams));

	wiphy_lock(wdev->wiphy);
	ret = cfg80211_change_iface(rdev, dev, type, &vifparams);
	wiphy_unlock(wdev->wiphy);

	return ret;
}
EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode);

int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
			  union iwreq_data *wrqu, char *extra)
{
	__u32 *mode = &wrqu->mode;
	struct wireless_dev *wdev = dev->ieee80211_ptr;

	if (!wdev)
		return -EOPNOTSUPP;

	switch (wdev->iftype) {
	case NL80211_IFTYPE_AP:
		*mode = IW_MODE_MASTER;
		break;
	case NL80211_IFTYPE_STATION:
		*mode = IW_MODE_INFRA;
		break;
	case NL80211_IFTYPE_ADHOC:
		*mode = IW_MODE_ADHOC;
		break;
	case NL80211_IFTYPE_MONITOR:
		*mode = IW_MODE_MONITOR;
		break;
	case NL80211_IFTYPE_WDS:
		*mode = IW_MODE_REPEAT;
		break;
	case NL80211_IFTYPE_AP_VLAN:
		*mode = IW_MODE_SECOND;		/* FIXME */
		break;
	default:
		*mode = IW_MODE_AUTO;
		break;
	}
	return 0;
}
EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode);

int cfg80211_wext_giwrange(struct net_device *dev,
			   struct iw_request_info *info,
			   union iwreq_data *wrqu, char *extra)
{
	struct iw_point *data = &wrqu->data;
	struct wireless_dev *wdev = dev->ieee80211_ptr;
	struct iw_range *range = (struct iw_range *) extra;
	enum nl80211_band band;
	int i, c = 0;

	if (!wdev)
		return -EOPNOTSUPP;

	data->length = sizeof(struct iw_range);
	memset(range, 0, sizeof(struct iw_range));

	range->we_version_compiled = WIRELESS_EXT;
	range->we_version_source = 21;
	range->retry_capa = IW_RETRY_LIMIT;
	range->retry_flags = IW_RETRY_LIMIT;
	range->min_retry = 0;
	range->max_retry = 255;
	range->min_rts = 0;
	range->max_rts = 2347;
	range->min_frag = 256;
	range->max_frag = 2346;

	range->max_encoding_tokens = 4;

	range->max_qual.updated = IW_QUAL_NOISE_INVALID;

	switch (wdev->wiphy->signal_type) {
	case CFG80211_SIGNAL_TYPE_NONE:
		break;
	case CFG80211_SIGNAL_TYPE_MBM:
		range->max_qual.level = (u8)-110;
		range->max_qual.qual = 70;
		range->avg_qual.qual = 35;
range->max_qual.updated |= IW_QUAL_DBM; range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; break; case CFG80211_SIGNAL_TYPE_UNSPEC: range->max_qual.level = 100; range->max_qual.qual = 100; range->avg_qual.qual = 50; range->max_qual.updated |= IW_QUAL_QUAL_UPDATED; range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED; break; } range->avg_qual.level = range->max_qual.level / 2; range->avg_qual.noise = range->max_qual.noise / 2; range->avg_qual.updated = range->max_qual.updated; for (i = 0; i < wdev->wiphy->n_cipher_suites; i++) { switch (wdev->wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_TKIP: range->enc_capa |= (IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_WPA); break; case WLAN_CIPHER_SUITE_CCMP: range->enc_capa |= (IW_ENC_CAPA_CIPHER_CCMP | IW_ENC_CAPA_WPA2); break; case WLAN_CIPHER_SUITE_WEP40: range->encoding_size[range->num_encoding_sizes++] = WLAN_KEY_LEN_WEP40; break; case WLAN_CIPHER_SUITE_WEP104: range->encoding_size[range->num_encoding_sizes++] = WLAN_KEY_LEN_WEP104; break; } } for (band = 0; band < NUM_NL80211_BANDS; band ++) { struct ieee80211_supported_band *sband; sband = wdev->wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (!(chan->flags & IEEE80211_CHAN_DISABLED)) { range->freq[c].i = ieee80211_frequency_to_channel( chan->center_freq); range->freq[c].m = chan->center_freq; range->freq[c].e = 6; c++; } } } range->num_channels = c; range->num_frequency = c; IW_EVENT_CAPA_SET_KERNEL(range->event_capa); IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP); IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN); if (wdev->wiphy->max_scan_ssids > 0) range->scan_capa |= IW_SCAN_CAPA_ESSID; return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwrange); /** * cfg80211_wext_freq - get wext frequency for non-"auto" * @freq: the wext freq encoding * * Returns: a frequency, or a negative error code, or 0 for auto. */ int cfg80211_wext_freq(struct iw_freq *freq) { /* * Parse frequency - return 0 for auto and * -EINVAL for impossible things. 
*/ if (freq->e == 0) { enum nl80211_band band = NL80211_BAND_2GHZ; if (freq->m < 0) return 0; if (freq->m > 14) band = NL80211_BAND_5GHZ; return ieee80211_channel_to_frequency(freq->m, band); } else { int i, div = 1000000; for (i = 0; i < freq->e; i++) div /= 10; if (div <= 0) return -EINVAL; return freq->m / div; } } int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 orts = wdev->wiphy->rts_threshold; int err; wiphy_lock(&rdev->wiphy); if (rts->disabled || !rts->fixed) { wdev->wiphy->rts_threshold = (u32) -1; } else if (rts->value < 0) { err = -EINVAL; goto out; } else { wdev->wiphy->rts_threshold = rts->value; } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); if (err) wdev->wiphy->rts_threshold = orts; out: wiphy_unlock(&rdev->wiphy); return err; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; rts->value = wdev->wiphy->rts_threshold; rts->disabled = rts->value == (u32) -1; rts->fixed = 1; return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 ofrag = wdev->wiphy->frag_threshold; int err; wiphy_lock(&rdev->wiphy); if (frag->disabled || !frag->fixed) { wdev->wiphy->frag_threshold = (u32) -1; } else if (frag->value < 256) { err = -EINVAL; goto out; } else { /* Fragment length must be even, so strip LSB. 
*/ wdev->wiphy->frag_threshold = frag->value & ~0x1; } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; out: wiphy_unlock(&rdev->wiphy); return err; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; frag->value = wdev->wiphy->frag_threshold; frag->disabled = frag->value == (u32) -1; frag->fixed = 1; return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag); static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 changed = 0; u8 olong = wdev->wiphy->retry_long; u8 oshort = wdev->wiphy->retry_short; int err; if (retry->disabled || retry->value < 1 || retry->value > 255 || (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; wiphy_lock(&rdev->wiphy); if (retry->flags & IW_RETRY_LONG) { wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; } else if (retry->flags & IW_RETRY_SHORT) { wdev->wiphy->retry_short = retry->value; changed |= WIPHY_PARAM_RETRY_SHORT; } else { wdev->wiphy->retry_short = retry->value; wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; changed |= WIPHY_PARAM_RETRY_SHORT; } err = rdev_set_wiphy_params(rdev, changed); if (err) { wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; } wiphy_unlock(&rdev->wiphy); return err; } int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; retry->disabled = 0; if (retry->flags == 0 || (retry->flags & IW_RETRY_SHORT)) { /* * First return short value, iwconfig will ask long value * later if needed */ retry->flags |= IW_RETRY_LIMIT | IW_RETRY_SHORT; retry->value = wdev->wiphy->retry_short; if (wdev->wiphy->retry_long == wdev->wiphy->retry_short) retry->flags |= IW_RETRY_LONG; return 0; } if (retry->flags & IW_RETRY_LONG) { retry->flags = IW_RETRY_LIMIT | IW_RETRY_LONG; retry->value = wdev->wiphy->retry_long; } return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwretry); static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, struct net_device *dev, bool pairwise, const u8 *addr, bool remove, bool tx_key, int idx, struct key_params *params) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err, i; bool rejoin = false; if (wdev->valid_links) return -EINVAL; if (pairwise && !addr) return -EINVAL; /* * In many cases we won't actually need this, but it's better * to do it first in case the allocation fails. Don't use wext. 
*/ if (!wdev->wext.keys) { wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), GFP_KERNEL); if (!wdev->wext.keys) return -ENOMEM; for (i = 0; i < 4; i++) wdev->wext.keys->params[i].key = wdev->wext.keys->data[i]; } if (wdev->iftype != NL80211_IFTYPE_ADHOC && wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC) { if (!wdev->connected) return -ENOLINK; if (!rdev->ops->set_default_mgmt_key) return -EOPNOTSUPP; if (idx < 4 || idx > 5) return -EINVAL; } else if (idx < 0 || idx > 3) return -EINVAL; if (remove) { err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) { /* * If removing the current TX key, we will need to * join a new IBSS without the privacy bit clear. */ if (idx == wdev->wext.default_key && wdev->iftype == NL80211_IFTYPE_ADHOC) { cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } if (!pairwise && addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) err = -ENOENT; else err = rdev_del_key(rdev, dev, -1, idx, pairwise, addr); } wdev->wext.connect.privacy = false; /* * Applications using wireless extensions expect to be * able to delete keys that don't exist, so allow that. */ if (err == -ENOENT) err = 0; if (!err) { if (!addr && idx < 4) { memset(wdev->wext.keys->data[idx], 0, sizeof(wdev->wext.keys->data[idx])); wdev->wext.keys->params[idx].key_len = 0; wdev->wext.keys->params[idx].cipher = 0; } if (idx == wdev->wext.default_key) wdev->wext.default_key = -1; else if (idx == wdev->wext.default_mgmt_key) wdev->wext.default_mgmt_key = -1; } if (!err && rejoin) err = cfg80211_ibss_wext_join(rdev, wdev); return err; } if (addr) tx_key = false; if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr)) return -EINVAL; err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_add_key(rdev, dev, -1, idx, pairwise, addr, params); else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; if (err) return err; /* * We only need to store WEP keys, since they're the only keys that * can be set before a connection is established and persist after * disconnecting. */ if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104)) { wdev->wext.keys->params[idx] = *params; memcpy(wdev->wext.keys->data[idx], params->key, params->key_len); wdev->wext.keys->params[idx].key = wdev->wext.keys->data[idx]; } if ((params->cipher == WLAN_CIPHER_SUITE_WEP40 || params->cipher == WLAN_CIPHER_SUITE_WEP104) && (tx_key || (!addr && wdev->wext.default_key == -1))) { if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) { /* * If we are getting a new TX key from not having * had one before we need to join a new IBSS with * the privacy bit set. 
*/ if (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->wext.default_key == -1) { cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } err = rdev_set_default_key(rdev, dev, -1, idx, true, true); } if (!err) { wdev->wext.default_key = idx; if (rejoin) err = cfg80211_ibss_wext_join(rdev, wdev); } return err; } if (params->cipher == WLAN_CIPHER_SUITE_AES_CMAC && (tx_key || (!addr && wdev->wext.default_mgmt_key == -1))) { if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_set_default_mgmt_key(rdev, dev, -1, idx); if (!err) wdev->wext.default_mgmt_key = idx; return err; } return 0; } static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *keybuf) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int idx, err; bool remove = false; struct key_params params; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; /* no use -- only MFP (set_default_mgmt_key) is optional */ if (!rdev->ops->del_key || !rdev->ops->add_key || !rdev->ops->set_default_key) return -EOPNOTSUPP; wiphy_lock(&rdev->wiphy); if (wdev->valid_links) { err = -EOPNOTSUPP; goto out; } idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) { err = -EINVAL; goto out; } else { idx--; } if (erq->flags & IW_ENCODE_DISABLED) remove = true; else if (erq->length == 0) { /* No key data - just set the default TX key index */ err = 0; if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) err = rdev_set_default_key(rdev, dev, -1, idx, true, true); if (!err) wdev->wext.default_key = idx; goto out; } memset(&params, 0, sizeof(params)); params.key = keybuf; params.key_len = erq->length; if (erq->length == 5) { params.cipher = WLAN_CIPHER_SUITE_WEP40; } else if (erq->length == 13) { params.cipher = WLAN_CIPHER_SUITE_WEP104; } else if (!remove) { err = -EINVAL; goto out; } err = cfg80211_set_encryption(rdev, dev, false, NULL, remove, wdev->wext.default_key == -1, idx, &params); out: wiphy_unlock(&rdev->wiphy); return err; } static int cfg80211_wext_siwencodeext(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct iw_encode_ext *ext = (struct iw_encode_ext *) extra; const u8 *addr; int idx; bool remove = false; struct key_params params; u32 cipher; int ret; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; /* no use -- only MFP (set_default_mgmt_key) is optional */ if (!rdev->ops->del_key || !rdev->ops->add_key || !rdev->ops->set_default_key) return -EOPNOTSUPP; if (wdev->valid_links) return -EOPNOTSUPP; switch (ext->alg) { case IW_ENCODE_ALG_NONE: remove = true; cipher = 0; break; case IW_ENCODE_ALG_WEP: if (ext->key_len == 5) cipher = WLAN_CIPHER_SUITE_WEP40; else if (ext->key_len == 13) cipher = WLAN_CIPHER_SUITE_WEP104; else return -EINVAL; break; case IW_ENCODE_ALG_TKIP: cipher = WLAN_CIPHER_SUITE_TKIP; break; case IW_ENCODE_ALG_CCMP: cipher = WLAN_CIPHER_SUITE_CCMP; break; case IW_ENCODE_ALG_AES_CMAC: cipher = WLAN_CIPHER_SUITE_AES_CMAC; break; default: return -EOPNOTSUPP; } if (erq->flags & 
IW_ENCODE_DISABLED) remove = true; idx = erq->flags & IW_ENCODE_INDEX; if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) { if (idx < 4 || idx > 5) { idx = wdev->wext.default_mgmt_key; if (idx < 0) return -EINVAL; } else idx--; } else { if (idx < 1 || idx > 4) { idx = wdev->wext.default_key; if (idx < 0) return -EINVAL; } else idx--; } addr = ext->addr.sa_data; if (is_broadcast_ether_addr(addr)) addr = NULL; memset(&params, 0, sizeof(params)); params.key = ext->key; params.key_len = ext->key_len; params.cipher = cipher; if (ext->ext_flags & IW_ENCODE_EXT_RX_SEQ_VALID) { params.seq = ext->rx_seq; params.seq_len = 6; } wiphy_lock(wdev->wiphy); ret = cfg80211_set_encryption( rdev, dev, !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), addr, remove, ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, idx, &params); wiphy_unlock(wdev->wiphy); return ret; } static int cfg80211_wext_giwencode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *keybuf) { struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; int idx; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { idx = wdev->wext.default_key; if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) return -EINVAL; else idx--; erq->flags = idx + 1; if (!wdev->wext.keys || !wdev->wext.keys->params[idx].cipher) { erq->flags |= IW_ENCODE_DISABLED; erq->length = 0; return 0; } erq->length = min_t(size_t, erq->length, wdev->wext.keys->params[idx].key_len); memcpy(keybuf, wdev->wext.keys->params[idx].key, erq->length); erq->flags |= IW_ENCODE_ENABLED; return 0; } static int cfg80211_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_freq *wextfreq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; int freq, ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); break; case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); break; case NL80211_IFTYPE_MONITOR: freq = cfg80211_wext_freq(wextfreq); if (freq < 0) { ret = freq; break; } if (freq == 0) { ret = -EINVAL; break; } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) { ret = -EINVAL; break; } ret = cfg80211_set_monitor_channel(rdev, &chandef); break; case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); if (freq < 0) { ret = freq; break; } if (freq == 0) { ret = -EINVAL; break; } chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); if (!chandef.chan) { ret = -EINVAL; break; } ret = cfg80211_set_mesh_channel(rdev, wdev, &chandef); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_freq *freq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = {}; int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); break; case NL80211_IFTYPE_ADHOC: ret = 
cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); break; case NL80211_IFTYPE_MONITOR: if (!rdev->ops->get_channel) { ret = -EINVAL; break; } ret = rdev_get_channel(rdev, wdev, 0, &chandef); if (ret) break; freq->m = chandef.chan->center_freq; freq->e = 6; ret = 0; break; default: ret = -EINVAL; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_siwtxpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; int ret; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; if (data->txpower.flags & IW_TXPOW_RANGE) return -EINVAL; if (!rdev->ops->set_tx_power) return -EOPNOTSUPP; /* only change when not disabling */ if (!data->txpower.disabled) { rfkill_set_sw_state(rdev->wiphy.rfkill, false); if (data->txpower.fixed) { /* * wext doesn't support negative values, see * below where it's for automatic */ if (data->txpower.value < 0) return -EINVAL; dbm = data->txpower.value; type = NL80211_TX_POWER_FIXED; /* TODO: do regulatory check! */ } else { /* * Automatic power level setting, max being the value * passed in from userland. */ if (data->txpower.value < 0) { type = NL80211_TX_POWER_AUTOMATIC; } else { dbm = data->txpower.value; type = NL80211_TX_POWER_LIMITED; } } } else { if (rfkill_set_sw_state(rdev->wiphy.rfkill, true)) schedule_work(&rdev->rfkill_block); return 0; } wiphy_lock(&rdev->wiphy); ret = rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwtxpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *data, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err, val; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; if (data->txpower.flags & IW_TXPOW_RANGE) return -EINVAL; if (!rdev->ops->get_tx_power) return -EOPNOTSUPP; wiphy_lock(&rdev->wiphy); err = rdev_get_tx_power(rdev, wdev, &val); wiphy_unlock(&rdev->wiphy); if (err) return err; /* well... 
oh well */ data->txpower.fixed = 1; data->txpower.disabled = rfkill_blocked(rdev->wiphy.rfkill); data->txpower.value = val; data->txpower.flags = IW_TXPOW_DBM; return 0; } static int cfg80211_set_auth_alg(struct wireless_dev *wdev, s32 auth_alg) { int nr_alg = 0; if (!auth_alg) return -EINVAL; if (auth_alg & ~(IW_AUTH_ALG_OPEN_SYSTEM | IW_AUTH_ALG_SHARED_KEY | IW_AUTH_ALG_LEAP)) return -EINVAL; if (auth_alg & IW_AUTH_ALG_OPEN_SYSTEM) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; } if (auth_alg & IW_AUTH_ALG_SHARED_KEY) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_SHARED_KEY; } if (auth_alg & IW_AUTH_ALG_LEAP) { nr_alg++; wdev->wext.connect.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; } if (nr_alg > 1) wdev->wext.connect.auth_type = NL80211_AUTHTYPE_AUTOMATIC; return 0; } static int cfg80211_set_wpa_version(struct wireless_dev *wdev, u32 wpa_versions) { if (wpa_versions & ~(IW_AUTH_WPA_VERSION_WPA | IW_AUTH_WPA_VERSION_WPA2| IW_AUTH_WPA_VERSION_DISABLED)) return -EINVAL; if ((wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) && (wpa_versions & (IW_AUTH_WPA_VERSION_WPA| IW_AUTH_WPA_VERSION_WPA2))) return -EINVAL; if (wpa_versions & IW_AUTH_WPA_VERSION_DISABLED) wdev->wext.connect.crypto.wpa_versions &= ~(NL80211_WPA_VERSION_1|NL80211_WPA_VERSION_2); if (wpa_versions & IW_AUTH_WPA_VERSION_WPA) wdev->wext.connect.crypto.wpa_versions |= NL80211_WPA_VERSION_1; if (wpa_versions & IW_AUTH_WPA_VERSION_WPA2) wdev->wext.connect.crypto.wpa_versions |= NL80211_WPA_VERSION_2; return 0; } static int cfg80211_set_cipher_group(struct wireless_dev *wdev, u32 cipher) { if (cipher & IW_AUTH_CIPHER_WEP40) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_WEP40; else if (cipher & IW_AUTH_CIPHER_WEP104) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_WEP104; else if (cipher & IW_AUTH_CIPHER_TKIP) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_TKIP; else if (cipher & IW_AUTH_CIPHER_CCMP) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_CCMP; else if (cipher & IW_AUTH_CIPHER_AES_CMAC) wdev->wext.connect.crypto.cipher_group = WLAN_CIPHER_SUITE_AES_CMAC; else if (cipher & IW_AUTH_CIPHER_NONE) wdev->wext.connect.crypto.cipher_group = 0; else return -EINVAL; return 0; } static int cfg80211_set_cipher_pairwise(struct wireless_dev *wdev, u32 cipher) { int nr_ciphers = 0; u32 *ciphers_pairwise = wdev->wext.connect.crypto.ciphers_pairwise; if (cipher & IW_AUTH_CIPHER_WEP40) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP40; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_WEP104) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_WEP104; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_TKIP) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_TKIP; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_CCMP) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_CCMP; nr_ciphers++; } if (cipher & IW_AUTH_CIPHER_AES_CMAC) { ciphers_pairwise[nr_ciphers] = WLAN_CIPHER_SUITE_AES_CMAC; nr_ciphers++; } BUILD_BUG_ON(NL80211_MAX_NR_CIPHER_SUITES < 5); wdev->wext.connect.crypto.n_ciphers_pairwise = nr_ciphers; return 0; } static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt) { int nr_akm_suites = 0; if (key_mgt & ~(IW_AUTH_KEY_MGMT_802_1X | IW_AUTH_KEY_MGMT_PSK)) return -EINVAL; if (key_mgt & IW_AUTH_KEY_MGMT_802_1X) { wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = WLAN_AKM_SUITE_8021X; nr_akm_suites++; } if (key_mgt & IW_AUTH_KEY_MGMT_PSK) { wdev->wext.connect.crypto.akm_suites[nr_akm_suites] = WLAN_AKM_SUITE_PSK; nr_akm_suites++; } 
wdev->wext.connect.crypto.n_akm_suites = nr_akm_suites; return 0; } static int cfg80211_wext_siwauth(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *data = &wrqu->param; struct wireless_dev *wdev = dev->ieee80211_ptr; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; switch (data->flags & IW_AUTH_INDEX) { case IW_AUTH_PRIVACY_INVOKED: wdev->wext.connect.privacy = data->value; return 0; case IW_AUTH_WPA_VERSION: return cfg80211_set_wpa_version(wdev, data->value); case IW_AUTH_CIPHER_GROUP: return cfg80211_set_cipher_group(wdev, data->value); case IW_AUTH_KEY_MGMT: return cfg80211_set_key_mgt(wdev, data->value); case IW_AUTH_CIPHER_PAIRWISE: return cfg80211_set_cipher_pairwise(wdev, data->value); case IW_AUTH_80211_AUTH_ALG: return cfg80211_set_auth_alg(wdev, data->value); case IW_AUTH_WPA_ENABLED: case IW_AUTH_RX_UNENCRYPTED_EAPOL: case IW_AUTH_DROP_UNENCRYPTED: case IW_AUTH_MFP: return 0; default: return -EOPNOTSUPP; } } static int cfg80211_wext_giwauth(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { /* XXX: what do we need? */ return -EOPNOTSUPP; } static int cfg80211_wext_siwpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); bool ps; int timeout = wdev->ps_timeout; int err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; if (!rdev->ops->set_power_mgmt) return -EOPNOTSUPP; if (wrq->disabled) { ps = false; } else { switch (wrq->flags & IW_POWER_MODE) { case IW_POWER_ON: /* If not specified */ case IW_POWER_MODE: /* If set all mask */ case IW_POWER_ALL_R: /* If explicitely state all */ ps = true; break; default: /* Otherwise we ignore */ return -EINVAL; } if (wrq->flags & ~(IW_POWER_MODE | IW_POWER_TIMEOUT)) return -EINVAL; if (wrq->flags & IW_POWER_TIMEOUT) timeout = wrq->value / 1000; } wiphy_lock(&rdev->wiphy); err = rdev_set_power_mgmt(rdev, dev, ps, timeout); wiphy_unlock(&rdev->wiphy); if (err) return err; wdev->ps = ps; wdev->ps_timeout = timeout; return 0; } static int cfg80211_wext_giwpower(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; wrq->disabled = !wdev->ps; return 0; } static int cfg80211_wext_siwrate(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; int band, ridx, ret; bool match = false; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; memset(&mask, 0, sizeof(mask)); fixed = 0; maxrate = (u32)-1; if (rate->value < 0) { /* nothing */ } else if (rate->fixed) { fixed = rate->value / 100000; } else { maxrate = rate->value / 100000; } for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wdev->wiphy->bands[band]; if (sband == NULL) continue; for (ridx = 0; ridx < sband->n_bitrates; ridx++) { struct ieee80211_rate *srate = &sband->bitrates[ridx]; if (fixed == srate->bitrate) { mask.control[band].legacy = 1 << ridx; match = true; break; } if (srate->bitrate <= maxrate) { mask.control[band].legacy |= 1 << ridx; match = 
true; } } } if (!match) return -EINVAL; wiphy_lock(&rdev->wiphy); if (dev->ieee80211_ptr->valid_links) ret = -EOPNOTSUPP; else ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwrate(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct station_info sinfo = {}; u8 addr[ETH_ALEN]; int err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!rdev->ops->get_station) return -EOPNOTSUPP; err = 0; if (!wdev->valid_links && wdev->links[0].client.current_bss) memcpy(addr, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); else err = -EOPNOTSUPP; if (err) return err; wiphy_lock(&rdev->wiphy); err = rdev_get_station(rdev, dev, addr, &sinfo); wiphy_unlock(&rdev->wiphy); if (err) return err; if (!(sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE))) { err = -EOPNOTSUPP; goto free; } rate->value = 100000 * cfg80211_calculate_bitrate(&sinfo.txrate); free: cfg80211_sinfo_release_content(&sinfo); return err; } /* Get wireless statistics. Called by /proc/net/wireless and by SIOCGIWSTATS */ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); /* we are under RTNL - globally locked - so can use static structs */ static struct iw_statistics wstats; static struct station_info sinfo = {}; u8 bssid[ETH_ALEN]; int ret; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) return NULL; if (!rdev->ops->get_station) return NULL; /* Grab BSSID of current BSS, if any */ wiphy_lock(&rdev->wiphy); if (wdev->valid_links || !wdev->links[0].client.current_bss) { wiphy_unlock(&rdev->wiphy); return NULL; } memcpy(bssid, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); memset(&sinfo, 0, sizeof(sinfo)); ret = rdev_get_station(rdev, dev, bssid, &sinfo); wiphy_unlock(&rdev->wiphy); if (ret) return NULL; memset(&wstats, 0, sizeof(wstats)); switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { int sig = sinfo.signal; wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.updated |= IW_QUAL_DBM; wstats.qual.level = sig; if (sig < -110) sig = -110; else if (sig > -40) sig = -40; wstats.qual.qual = sig + 110; break; } fallthrough; case CFG80211_SIGNAL_TYPE_UNSPEC: if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) { wstats.qual.updated |= IW_QUAL_LEVEL_UPDATED; wstats.qual.updated |= IW_QUAL_QUAL_UPDATED; wstats.qual.level = sinfo.signal; wstats.qual.qual = sinfo.signal; break; } fallthrough; default: wstats.qual.updated |= IW_QUAL_LEVEL_INVALID; wstats.qual.updated |= IW_QUAL_QUAL_INVALID; } wstats.qual.updated |= IW_QUAL_NOISE_INVALID; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC)) wstats.discard.misc = sinfo.rx_dropped_misc; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) wstats.discard.retries = sinfo.tx_failed; cfg80211_sinfo_release_content(&sinfo); return &wstats; } static int cfg80211_wext_siwap(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = 
wiphy_to_rdev(wdev->wiphy); int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwap(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *ssid) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_siwessid(dev, info, data, ssid); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_siwessid(dev, info, data, ssid); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *ssid) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; data->flags = 0; data->length = 0; wiphy_lock(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: ret = cfg80211_ibss_wext_giwessid(dev, info, data, ssid); break; case NL80211_IFTYPE_STATION: ret = cfg80211_mgd_wext_giwessid(dev, info, data, ssid); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static int cfg80211_wext_siwpmksa(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmksa cfg_pmksa; struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; int ret; memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa)); if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; cfg_pmksa.bssid = pmksa->bssid.sa_data; cfg_pmksa.pmkid = pmksa->pmkid; wiphy_lock(&rdev->wiphy); switch (pmksa->cmd) { case IW_PMKSA_ADD: if (!rdev->ops->set_pmksa) { ret = -EOPNOTSUPP; break; } ret = rdev_set_pmksa(rdev, dev, &cfg_pmksa); break; case IW_PMKSA_REMOVE: if (!rdev->ops->del_pmksa) { ret = -EOPNOTSUPP; break; } ret = rdev_del_pmksa(rdev, dev, &cfg_pmksa); break; case IW_PMKSA_FLUSH: if (!rdev->ops->flush_pmksa) { ret = -EOPNOTSUPP; break; } ret = rdev_flush_pmksa(rdev, dev); break; default: ret = -EOPNOTSUPP; break; } wiphy_unlock(&rdev->wiphy); return ret; } static const iw_handler cfg80211_handlers[] = { IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), IW_HANDLER(SIOCSIWFREQ, cfg80211_wext_siwfreq), IW_HANDLER(SIOCGIWFREQ, cfg80211_wext_giwfreq), IW_HANDLER(SIOCSIWMODE, cfg80211_wext_siwmode), IW_HANDLER(SIOCGIWMODE, cfg80211_wext_giwmode), IW_HANDLER(SIOCGIWRANGE, 
cfg80211_wext_giwrange), IW_HANDLER(SIOCSIWAP, cfg80211_wext_siwap), IW_HANDLER(SIOCGIWAP, cfg80211_wext_giwap), IW_HANDLER(SIOCSIWMLME, cfg80211_wext_siwmlme), IW_HANDLER(SIOCSIWSCAN, cfg80211_wext_siwscan), IW_HANDLER(SIOCGIWSCAN, cfg80211_wext_giwscan), IW_HANDLER(SIOCSIWESSID, cfg80211_wext_siwessid), IW_HANDLER(SIOCGIWESSID, cfg80211_wext_giwessid), IW_HANDLER(SIOCSIWRATE, cfg80211_wext_siwrate), IW_HANDLER(SIOCGIWRATE, cfg80211_wext_giwrate), IW_HANDLER(SIOCSIWRTS, cfg80211_wext_siwrts), IW_HANDLER(SIOCGIWRTS, cfg80211_wext_giwrts), IW_HANDLER(SIOCSIWFRAG, cfg80211_wext_siwfrag), IW_HANDLER(SIOCGIWFRAG, cfg80211_wext_giwfrag), IW_HANDLER(SIOCSIWTXPOW, cfg80211_wext_siwtxpower), IW_HANDLER(SIOCGIWTXPOW, cfg80211_wext_giwtxpower), IW_HANDLER(SIOCSIWRETRY, cfg80211_wext_siwretry), IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), IW_HANDLER(SIOCSIWENCODE, cfg80211_wext_siwencode), IW_HANDLER(SIOCGIWENCODE, cfg80211_wext_giwencode), IW_HANDLER(SIOCSIWPOWER, cfg80211_wext_siwpower), IW_HANDLER(SIOCGIWPOWER, cfg80211_wext_giwpower), IW_HANDLER(SIOCSIWGENIE, cfg80211_wext_siwgenie), IW_HANDLER(SIOCSIWAUTH, cfg80211_wext_siwauth), IW_HANDLER(SIOCGIWAUTH, cfg80211_wext_giwauth), IW_HANDLER(SIOCSIWENCODEEXT, cfg80211_wext_siwencodeext), IW_HANDLER(SIOCSIWPMKSA, cfg80211_wext_siwpmksa), }; const struct iw_handler_def cfg80211_wext_handler = { .num_standard = ARRAY_SIZE(cfg80211_handlers), .standard = cfg80211_handlers, .get_wireless_stats = cfg80211_wireless_stats, };
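The table above is the whole of cfg80211's wireless-extensions compatibility surface: each IW_HANDLER() entry maps a legacy SIOC*IW* ioctl onto a handler that takes the wiphy lock and forwards to the matching rdev op. As a rough illustration of what ends up in those handlers, here is a small userspace probe (not part of the kernel sources above) that issues SIOCGIWNAME and SIOCGIWTXPOW; the interface name "wlan0" is an assumption.

/* Hypothetical userspace probe: the kernel routes these ioctls through
 * cfg80211_wext_giwname()/cfg80211_wext_giwtxpower() via cfg80211_handlers[].
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/wireless.h>

int main(void)
{
	struct iwreq wrq;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&wrq, 0, sizeof(wrq));
	strncpy(wrq.ifr_ifrn.ifrn_name, "wlan0", IFNAMSIZ - 1);

	/* SIOCGIWNAME doubles as "is this a wireless interface?" */
	if (ioctl(fd, SIOCGIWNAME, &wrq) == 0)
		printf("protocol: %s\n", wrq.u.name);

	/* Tx power comes back as a struct iw_param, in dBm when
	 * (flags & IW_TXPOW_TYPE) == IW_TXPOW_DBM, mirroring the checks
	 * in cfg80211_wext_giwtxpower() above.
	 */
	if (ioctl(fd, SIOCGIWTXPOW, &wrq) == 0 &&
	    (wrq.u.txpower.flags & IW_TXPOW_TYPE) == IW_TXPOW_DBM)
		printf("txpower: %d dBm%s\n", wrq.u.txpower.value,
		       wrq.u.txpower.disabled ? " (disabled)" : "");

	close(fd);
	return 0;
}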
/* * net/tipc/msg.h: Include file for TIPC message header routines * * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_MSG_H #define _TIPC_MSG_H #include <linux/tipc.h> #include "core.h" /* * Constants and routines used to read and write TIPC payload message headers * * Note: Some items are also used with TIPC internal message headers */ #define TIPC_VERSION 2 struct plist; /* * Payload message users are defined in TIPC's public API: * - TIPC_LOW_IMPORTANCE * - TIPC_MEDIUM_IMPORTANCE * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ #define TIPC_SYSTEM_IMPORTANCE 4 /* * Payload message types */ #define TIPC_CONN_MSG 0 #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 #define TIPC_GRP_MCAST_MSG 6 #define TIPC_GRP_UCAST_MSG 7 /* * Internal message users */ #define BCAST_PROTOCOL 5 #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 #define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define MSG_CRYPTO 14 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ /* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ #define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) #define TIPC_MEDIA_INFO_OFFSET 5 extern const int one_page_mtu; struct tipc_skb_cb { union { struct { struct sk_buff *tail; unsigned long nxt_retr; unsigned long retr_stamp; u32 bytes_read; u32 orig_member; u16 chain_imp; u16 ackers; u16 retr_cnt; } __packed; #ifdef CONFIG_TIPC_CRYPTO struct { struct tipc_crypto *rx; struct tipc_aead *last; u8 recurs; } tx_clone_ctx __packed; #endif } __packed; union { struct { u8 validated:1; #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; #define SKB_PROBING 1 #define SKB_GRACING 2 u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; u8 flags; }; u8 reserved; #ifdef CONFIG_TIPC_CRYPTO void *crypto_ctx; #endif } __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) struct tipc_msg { __be32 hdr[15]; }; /* struct tipc_gap_ack - TIPC Gap ACK block * @ack: seqno of the last consecutive packet in link deferdq * @gap: number of gap packets since the last ack * * E.g: * link deferdq: 1 2 3 4 10 11 13 14 15 20 * --> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0> */ struct tipc_gap_ack { __be16 ack; __be16 gap; }; /* struct tipc_gap_ack_blks * @len: actual length of the record * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast * ones) * @start_index: starting index for "valid" broadcast Gap ACK blocks * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks * * 31 16 
15 0 * +-------------+-------------+-------------+-------------+ * | bgack_cnt | ugack_cnt | len | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > bc gacks * : : : | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > uc gacks * : : : | * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; union { u8 ugack_cnt; u8 start_index; }; u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; } static inline u32 msg_word(struct tipc_msg *m, u32 pos) { return ntohl(m->hdr[pos]); } static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) { m->hdr[w] = htonl(val); } static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) { return (msg_word(m, w) >> pos) & mask; } static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { val = (val & mask) << pos; mask = mask << pos; m->hdr[w] &= ~htonl(mask); m->hdr[w] |= htonl(val); } /* * Word 0 */ static inline u32 msg_version(struct tipc_msg *m) { return msg_bits(m, 0, 29, 7); } static inline void msg_set_version(struct tipc_msg *m) { msg_set_bits(m, 0, 29, 7, TIPC_VERSION); } static inline u32 msg_user(struct tipc_msg *m) { return msg_bits(m, 0, 25, 0xf); } static inline u32 msg_isdata(struct tipc_msg *m) { return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 25, 0xf, n); } static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; } static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 21, 0xf, n>>2); } static inline u32 msg_size(struct tipc_msg *m) { return msg_bits(m, 0, 0, 0x1ffff); } static inline u32 msg_blocks(struct tipc_msg *m) { return (msg_size(m) / 1024) + 1; } static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); } static inline int msg_non_seq(struct tipc_msg *m) { return msg_bits(m, 0, 20, 1); } static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 20, 1, n); } static inline int msg_is_syn(struct tipc_msg *m) { return msg_bits(m, 0, 17, 1); } static inline void msg_set_syn(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 17, 1, d); } static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_is_keepalive(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 18, 1, d); } static inline int msg_ack_required(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_ack_required(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline int msg_nagle_ack(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_nagle_ack(struct tipc_msg *m) { msg_set_bits(m, 0, 
18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); } static inline void msg_set_is_rcast(struct tipc_msg *m, bool d) { msg_set_bits(m, 0, 18, 0x1, d); } static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } static inline unchar *msg_data(struct tipc_msg *m) { return ((unchar *)m) + msg_hdr_sz(m); } static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } /* * Word 1 */ static inline u32 msg_type(struct tipc_msg *m) { return msg_bits(m, 1, 29, 0x7); } static inline void msg_set_type(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 29, 0x7, n); } static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) { return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; } static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) { return msg_type(m) == TIPC_CONN_MSG; } static inline u32 msg_direct(struct tipc_msg *m) { return msg_type(m) == TIPC_DIRECT_MSG; } static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); } static inline void msg_set_errcode(struct tipc_msg *m, u32 err) { msg_set_bits(m, 1, 25, 0xf, err); } static inline void msg_set_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 28, 0x1, 1); } static inline u32 msg_is_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 28, 0x1); } static inline void msg_set_last_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 27, 0x1, 1); } static inline u32 msg_is_last_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 27, 0x1); } static inline void msg_set_non_legacy(struct tipc_msg *m) { msg_set_bits(m, 1, 26, 0x1, 1); } static inline u32 msg_is_legacy(struct tipc_msg *m) { return !msg_bits(m, 1, 26, 0x1); } static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); } static inline void msg_incr_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); } static inline u32 msg_lookup_scope(struct tipc_msg *m) { return msg_bits(m, 1, 19, 0x3); } static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 19, 0x3, n); } static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch * link peer session number */ static inline bool msg_dest_session_valid(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1); } static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid) { msg_set_bits(m, 1, 16, 0x1, valid); } static inline u16 msg_dest_session(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_dest_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* * Word 2 */ static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void 
msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Words 3-10 */ static inline u32 msg_importance(struct tipc_msg *m) { int usr = msg_user(m); if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); } static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); } static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 3, a); } static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = msg_inner_hdr(m); return msg_word(m, 4); } static inline void msg_set_origport(struct tipc_msg *m, u32 p) { msg_set_word(m, 4, p); } static inline u16 msg_named_seqno(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_destport(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline u32 msg_mc_netid(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline int msg_short(struct tipc_msg *m) { return msg_hdr_sz(m) == SHORT_H_SIZE; } static inline u32 msg_orignode(struct tipc_msg *m) { if (likely(msg_short(m))) return msg_prevnode(m); return msg_word(m, 6); } static inline void msg_set_orignode(struct tipc_msg *m, u32 a) { msg_set_word(m, 6, a); } static inline u32 msg_destnode(struct tipc_msg *m) { return msg_word(m, 7); } static inline void msg_set_destnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 7, a); } static inline u32 msg_nametype(struct tipc_msg *m) { return msg_word(m, 8); } static inline void msg_set_nametype(struct tipc_msg *m, u32 n) { msg_set_word(m, 8, n); } static inline u32 msg_nameinst(struct tipc_msg *m) { return msg_word(m, 9); } static inline u32 msg_namelower(struct tipc_msg *m) { return msg_nameinst(m); } static inline void msg_set_namelower(struct tipc_msg *m, u32 n) { msg_set_word(m, 9, n); } static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) { msg_set_namelower(m, n); } static inline u32 msg_nameupper(struct tipc_msg *m) { return msg_word(m, 10); } static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) { msg_set_word(m, 10, n); } /* * Constants and routines used to read and write TIPC internal message headers */ /* * Connection management protocol message types */ #define CONN_PROBE 0 #define CONN_PROBE_REPLY 1 #define CONN_ACK 2 /* * Name distributor message types */ #define PUBLICATION 0 #define WITHDRAWAL 1 /* * Segmentation message types */ #define FIRST_FRAGMENT 0 #define FRAGMENT 1 #define LAST_FRAGMENT 2 /* * Link management protocol message types */ #define STATE_MSG 0 #define RESET_MSG 1 #define ACTIVATE_MSG 2 /* * Changeover tunnel message types */ #define SYNCH_MSG 0 #define FAILOVER_MSG 1 /* * Config protocol message types */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 #define DSC_TRIAL_MSG 2 #define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types */ #define GRP_JOIN_MSG 0 #define 
GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 /* Crypto message types */ #define KEY_DISTR_MSG 0 /* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1fff); } static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 16, 0x1fff, n); } static inline u32 msg_node_sig(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 0, 0xffff, n); } static inline u32 msg_node_capabilities(struct tipc_msg *m) { return msg_bits(m, 1, 15, 0x1fff); } static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 15, 0x1fff, n); } /* * Word 2 */ static inline u32 msg_dest_domain(struct tipc_msg *m) { return msg_word(m, 2); } static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) { msg_set_word(m, 2, n); } static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u32 msg_bcgap_to(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Word 4 */ static inline u32 msg_last_bcast(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) { return msg_last_bcast(m) + 1; } static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u32 msg_nof_fragms(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_fragm_no(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_bc_netid(struct tipc_msg *m) { return msg_word(m, 4); } static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) { msg_set_word(m, 4, id); } static inline u32 msg_link_selector(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } /* * Word 5 */ static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } static inline u32 msg_probe(struct tipc_msg *m) { return msg_bits(m, 5, 0, 1); } static inline void msg_set_probe(struct tipc_msg *m, u32 val) { msg_set_bits(m, 5, 0, 1, val); } static inline char msg_net_plane(struct tipc_msg *m) { return msg_bits(m, 5, 1, 7) + 'A'; } static inline void msg_set_net_plane(struct tipc_msg *m, char n) { msg_set_bits(m, 5, 1, 7, (n - 'A')); } static inline u32 msg_linkprio(struct tipc_msg *m) { return msg_bits(m, 5, 4, 0x1f); } static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 4, 0x1f, n); } static inline u32 msg_bearer_id(struct tipc_msg *m) { return msg_bits(m, 5, 9, 0x7); } static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 9, 0x7, n); } static inline u32 msg_redundant_link(struct tipc_msg *m) { return msg_bits(m, 5, 12, 0x1); } static inline void msg_set_redundant_link(struct tipc_msg *m, 
u32 r) { msg_set_bits(m, 5, 12, 0x1, r); } static inline u32 msg_peer_stopping(struct tipc_msg *m) { return msg_bits(m, 5, 13, 0x1); } static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) { msg_set_bits(m, 5, 13, 0x1, s); } static inline bool msg_bc_ack_invalid(struct tipc_msg *m) { switch (msg_user(m)) { case BCAST_PROTOCOL: case NAME_DISTRIBUTOR: case LINK_PROTOCOL: return msg_bits(m, 5, 14, 0x1); default: return false; } } static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid) { msg_set_bits(m, 5, 14, 0x1, invalid); } static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } static inline u32 msg_bc_gap(struct tipc_msg *m) { return msg_bits(m, 8, 0, 0x3ff); } static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 8, 0, 0x3ff, n); } /* * Word 9 */ static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, (n / 4)); } static inline u32 msg_link_tolerance(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_bc_acked(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_remitted(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x3); } static inline void msg_set_grp_evt(struct tipc_msg *m, int n) { msg_set_bits(m, 10, 0, 0x3, n); } static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x1); } static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) { msg_set_bits(m, 10, 0, 0x1, n); } static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); } static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 10, 16, 0xffff, n); } static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; if (msg_type(m) == STATE_MSG) return true; return false; } static inline bool msg_peer_node_is_up(struct tipc_msg *m) { if (msg_peer_link_is_up(m)) return true; return msg_redundant_link(m); } static inline 
bool msg_is_reset(struct tipc_msg *hdr) { return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } /* Word 13 */ static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) { msg_set_word(m, 13, n); } static inline u32 msg_peer_net_hash(struct tipc_msg *m) { return msg_word(m, 13); } /* Word 14 */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); } static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) { msg_set_word(m, 14, n); } static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) { memcpy(msg_data(hdr), id, 16); } static inline u8 *msg_node_id(struct tipc_msg *hdr) { return (u8 *)msg_data(hdr); } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); static inline u16 buf_seqno(struct sk_buff *skb) { return msg_seqno(buf_msg(skb)); } static inline int buf_roundup_len(struct sk_buff *skb) { return (skb->len / 1024 + 1) * 1024; } /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any */ static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list, spinlock_t *lock) { struct sk_buff *skb; spin_lock_bh(lock); skb = skb_peek(list); if (skb) skb_get(skb); spin_unlock_bh(lock); return skb; } /* tipc_skb_peek_port(): find a destination port, ignoring all destinations * up to and including 'filter'. * Note: ignoring previously tried destinations minimizes the risk of * contention on the socket lock * @list: list to be peeked in * @filter: last destination to be ignored from search * Returns a destination port number, of applicable. 
*/ static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter) { struct sk_buff *skb; u32 dport = 0; bool ignore = true; spin_lock_bh(&list->lock); skb_queue_walk(list, skb) { dport = msg_destport(buf_msg(skb)); if (!filter || skb_queue_is_last(list, skb)) break; if (dport == filter) ignore = false; else if (!ignore) break; } spin_unlock_bh(&list->lock); return dport; } /* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list * @list: list to be unlinked from * @dport: selection criteria for buffer to unlink */ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, u32 dport) { struct sk_buff *_skb, *tmp, *skb = NULL; spin_lock_bh(&list->lock); skb_queue_walk_safe(list, _skb, tmp) { if (msg_destport(buf_msg(_skb)) == dport) { __skb_unlink(_skb, list); skb = _skb; break; } } spin_unlock_bh(&list->lock); return skb; } /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, struct sk_buff_head *head) { spin_lock_bh(&head->lock); skb_queue_splice_tail(list, head); spin_unlock_bh(&head->lock); } /* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists * @list: the new list to add. Lock protected. Will be reinitialized * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { struct sk_buff_head tmp; __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); tipc_skb_queue_splice_tail(&tmp, head); } /* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno * @list: list to be dequeued from * @seqno: seqno of the expected msg * * returns skb dequeued from the list if its seqno is less than or equal to * the expected one, otherwise the skb is still hold * * Note: must be used with appropriate locks held only */ static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list, u16 seqno) { struct sk_buff *skb = skb_peek(list); if (skb && less_eq(buf_seqno(skb), seqno)) { __skb_unlink(skb, list); return skb; } return NULL; } #endif
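Nearly every accessor in the header above reduces to msg_bits()/msg_set_bits(): the TIPC header is an array of big-endian 32-bit words, and each field is addressed as a (word, position, mask) triple. The following standalone sketch mirrors that scheme for word 0 (version/user/size) outside the kernel, purely to illustrate the round-trip; it is not TIPC code.

/* Demo of the shift/mask accessor pattern used by msg_bits()/msg_set_bits().
 * The layout (version at bits 29..31, user at 25..28, size at 0..16)
 * mirrors TIPC header word 0.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htonl()/ntohl() */

static uint32_t get_bits(uint32_t be_word, uint32_t pos, uint32_t mask)
{
	return (ntohl(be_word) >> pos) & mask;
}

static void set_bits(uint32_t *be_word, uint32_t pos, uint32_t mask, uint32_t val)
{
	*be_word &= ~htonl(mask << pos);
	*be_word |= htonl((val & mask) << pos);
}

int main(void)
{
	uint32_t w0 = 0;	/* stands in for tipc_msg.hdr[0] */

	set_bits(&w0, 29, 0x7, 2);	/* version = TIPC_VERSION */
	set_bits(&w0, 25, 0xf, 0);	/* user = TIPC_LOW_IMPORTANCE */
	set_bits(&w0, 0, 0x1ffff, 148);	/* message size in bytes */

	printf("version=%u user=%u size=%u\n",
	       get_bits(w0, 29, 0x7),
	       get_bits(w0, 25, 0xf),
	       get_bits(w0, 0, 0x1ffff));
	return 0;
}

Keeping the words big-endian in memory and converting only inside the accessors is what allows the same struct to be placed on the wire unchanged.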
/* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset). * * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup.
*/ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; xdp->flags = 0; } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } struct xdp_frame { void *data; u16 len; u16 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. 
*/ struct xdp_mem_info mem; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; void *xa; void *q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { /* bq->count will be zero'ed when bq->xa gets updated */ bq->xa = NULL; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { skb_shinfo(skb)->nr_frags = nr_frags; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= pfmemalloc; } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? 
metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; return xdp_frame; } void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. 
*/ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { unsigned long meta_max; meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); BUILD_BUG_ON(!__builtin_constant_p(meta_max)); return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ NETDEV_XDP_RX_METADATA_VLAN_TAG, \ bpf_xdp_metadata_rx_vlan_tag, \ xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. 
*/ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */
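/*
 * Illustrative sketch only (not part of the header above): a typical driver
 * RX path runs the attached program with bpf_prog_run_xdp() and, when the
 * packet must outlive the current NAPI cycle (the XDP_TX case here), converts
 * the xdp_buff into an xdp_frame.  The xdp_buff is assumed to have been set
 * up elsewhere; TX-ring enqueueing and XDP_REDIRECT/XDP_ABORTED handling are
 * elided; all example_* names are hypothetical.
 */
#include <linux/bpf.h>
#include <net/xdp.h>

static u32 example_run_xdp(struct bpf_prog *prog, struct xdp_buff *xdp)
{
	u32 act = bpf_prog_run_xdp(prog, xdp);

	switch (act) {
	case XDP_PASS:
		/* Caller continues with the normal skb path. */
		break;
	case XDP_TX: {
		/* Detach the packet from the rxq so it can be queued for TX. */
		struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);

		if (unlikely(!xdpf)) {
			act = XDP_DROP;
			break;
		}
		/* ...enqueue xdpf here; on failure: xdp_return_frame(xdpf); */
		break;
	}
	case XDP_DROP:
	default:
		xdp_return_buff(xdp);
		break;
	}

	return act;
}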
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_JUMP_LABEL_H
#define _ASM_X86_JUMP_LABEL_H

#define HAVE_JUMP_LABEL_BATCH

#include <asm/asm.h>
#include <asm/nops.h>

#ifndef __ASSEMBLY__

#include <linux/stringify.h>
#include <linux/types.h>

#define JUMP_TABLE_ENTRY				\
	".pushsection __jump_table, \"aw\" \n\t"	\
	_ASM_ALIGN "\n\t"				\
	".long 1b - . \n\t"				\
	".long %l[l_yes] - . \n\t"			\
	_ASM_PTR "%c0 + %c1 - .\n\t"			\
	".popsection \n\t"

#ifdef CONFIG_HAVE_JUMP_LABEL_HACK

static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
	asm goto("1:"
		"jmp %l[l_yes] # objtool NOPs this \n\t"
		JUMP_TABLE_ENTRY
		: : "i" (key), "i" (2 | branch) : : l_yes);

	return false;
l_yes:
	return true;
}

#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */

static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
	asm goto("1:"
		".byte " __stringify(BYTES_NOP5) "\n\t"
		JUMP_TABLE_ENTRY
		: : "i" (key), "i" (branch) : : l_yes);

	return false;
l_yes:
	return true;
}

#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */

static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
	asm goto("1:"
		"jmp %l[l_yes]\n\t"
		JUMP_TABLE_ENTRY
		: : "i" (key), "i" (branch) : : l_yes);

	return false;
l_yes:
	return true;
}

extern int arch_jump_entry_size(struct jump_entry *entry);

#endif /* __ASSEMBLY__ */
#endif
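/*
 * Illustrative sketch (not part of the arch header above): the
 * arch_static_branch*() primitives are normally consumed through the generic
 * static-key API in <linux/jump_label.h>.  A subsystem defines a key and
 * branches on it; the branch site is patched between a NOP and a jump at
 * runtime instead of testing a variable.  Names prefixed example_ are
 * hypothetical.
 */
#include <linux/jump_label.h>
#include <linux/printk.h>

static DEFINE_STATIC_KEY_FALSE(example_feature_key);

static inline void example_fast_path(void)
{
	/* Compiles down to a NOP until the key is enabled. */
	if (static_branch_unlikely(&example_feature_key))
		pr_info("example feature enabled\n");
}

static void example_enable_feature(void)
{
	static_branch_enable(&example_feature_key);	/* patches the branch sites */
}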
38 20 68 26 149 149 20 145 1343 1342 401 363 12 7 30 392 394 14 384 22 384 22 84 74 10 3 84 326 84 401 394 58 394 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux network device link state notification * * Author: * Stefan Rompf <sux@loplof.de> */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/rtnetlink.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/types.h> #include "dev.h" enum lw_bits { LW_URGENT = 0, }; static unsigned long linkwatch_flags; static unsigned long linkwatch_nextevent; static void linkwatch_event(struct work_struct *dummy); static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; /* Some uppers (DSA) have additional sources for being down, so * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { int iflink = dev_get_iflink(dev); struct net_device *peer; if (iflink == dev->ifindex) return IF_OPER_DOWN; peer = __dev_get_by_index(dev_net(dev), iflink); if (!peer) return IF_OPER_DOWN; return netif_carrier_ok(peer) ? 
IF_OPER_DOWN : IF_OPER_LOWERLAYERDOWN; } if (netif_dormant(dev)) return IF_OPER_DORMANT; return IF_OPER_UP; } static void rfc2863_policy(struct net_device *dev) { unsigned char operstate = default_operstate(dev); if (operstate == dev->operstate) return; write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) operstate = IF_OPER_TESTING; break; case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; case IF_LINK_MODE_DEFAULT: default: break; } dev->operstate = operstate; write_unlock(&dev_base_lock); } void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ if (!netif_carrier_ok(dev) || netif_dormant(dev) || netif_testing(dev)) rfc2863_policy(dev); } static bool linkwatch_urgent_event(struct net_device *dev) { if (!netif_running(dev)) return false; if (dev->ifindex != dev_get_iflink(dev)) return true; if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); } static void linkwatch_add_event(struct net_device *dev) { unsigned long flags; spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } static void linkwatch_schedule_work(int urgent) { unsigned long delay = linkwatch_nextevent - jiffies; if (test_bit(LW_URGENT, &linkwatch_flags)) return; /* Minimise down-time: drop delay for up event. */ if (urgent) { if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) return; delay = 0; } /* If we wrap around we'll delay it by at most HZ. */ if (delay > HZ) delay = 0; /* * If urgent, schedule immediate execution; otherwise, don't * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) mod_delayed_work(system_wq, &linkwatch_work, 0); else schedule_delayed_work(&linkwatch_work, delay); } static void linkwatch_do_dev(struct net_device *dev) { /* * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted */ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) dev_activate(dev); else dev_deactivate(dev); netdev_state_change(dev); } /* Note: our callers are responsible for calling netdev_tracker_free(). * This is the reason we use __dev_put() instead of dev_put(). */ __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) { #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; /* Use a local list here since we add non-urgent * events back to the global one when called with * urgent_only=1. */ LIST_HEAD(wrk); /* Give urgent case more budget */ if (urgent_only) do_dev += MAX_DO_DEV_PER_LOOP; /* * Limit the number of linkwatch events to one * per second so that a runaway driver does not * cause a storm of messages on the netlink * socket. This limit does not apply to up events * while the device qdisc is down. */ if (!urgent_only) linkwatch_nextevent = jiffies + HZ; /* Limit wrap-around effect on delay. 
*/ else if (time_after(linkwatch_nextevent, jiffies + HZ)) linkwatch_nextevent = jiffies; clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); list_splice_init(&lweventlist, &wrk); while (!list_empty(&wrk) && do_dev > 0) { struct net_device *dev; dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); if (!netif_device_present(dev) || (urgent_only && !linkwatch_urgent_event(dev))) { list_add_tail(&dev->link_watch_list, &lweventlist); continue; } /* We must free netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); linkwatch_do_dev(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } /* Add the remaining work back to lweventlist */ list_splice_init(&wrk, &lweventlist); if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); } void linkwatch_sync_dev(struct net_device *dev) { unsigned long flags; int clean = 0; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); clean = 1; /* We must release netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); if (clean) linkwatch_do_dev(dev); } /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) { __linkwatch_run_queue(0); } static void linkwatch_event(struct work_struct *dummy) { rtnl_lock(); __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); rtnl_unlock(); } void linkwatch_fire_event(struct net_device *dev) { bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { linkwatch_add_event(dev); } else if (!urgent) return; linkwatch_schedule_work(urgent); } EXPORT_SYMBOL(linkwatch_fire_event);
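/*
 * Illustrative sketch: drivers do not call into the linkwatch machinery above
 * directly.  They report carrier changes with netif_carrier_on()/off() from
 * <linux/netdevice.h>, which end up calling linkwatch_fire_event(); the
 * deferred work then applies the RFC 2863 operstate policy and activates or
 * deactivates the qdisc.  example_link_irq() is a hypothetical link-change
 * interrupt handler.
 */
#include <linux/netdevice.h>

static void example_link_irq(struct net_device *dev, bool link_up)
{
	if (link_up)
		netif_carrier_on(dev);	/* queues an urgent linkwatch event */
	else
		netif_carrier_off(dev);
}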
1 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com> * * Based off of net/unix/unix_bpf.c */ #include <linux/bpf.h> #include <linux/module.h> #include <linux/skmsg.h> #include <linux/socket.h> #include <linux/wait.h> #include <net/af_vsock.h> #include <net/sock.h> #define vsock_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&(__sk)->sk_receive_queue) || \ !skb_queue_empty(&(__psock)->ingress_skb) || \ !list_empty(&(__psock)->ingress_msg); \ }) static struct proto *vsock_prot_saved __read_mostly; static DEFINE_SPINLOCK(vsock_prot_lock); static struct proto vsock_bpf_prot; static bool vsock_has_data(struct sock *sk, struct sk_psock *psock) { struct vsock_sock *vsk = vsock_sk(sk); s64 ret; ret = vsock_connectible_has_data(vsk); if (ret > 0) return true; return vsock_sk_has_data(sk, psock); } static bool vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { bool ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk->sk_shutdown & RCV_SHUTDOWN) return true; if (!timeo) return false; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = vsock_has_data(sk, psock); if (!ret) { wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ret = vsock_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { struct socket *sock = sk->sk_socket; int err; if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) err = vsock_connectible_recvmsg(sock, msg, len, flags); else if (sk->sk_type == SOCK_DGRAM) err = vsock_dgram_recvmsg(sock, msg, len, flags); else err = -EPROTOTYPE; return err; } static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; int copied; psock = sk_psock_get(sk); if (unlikely(!psock)) return __vsock_recvmsg(sk, msg, len, flags); lock_sock(sk); if (vsock_has_data(sk, psock) && sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); return __vsock_recvmsg(sk, msg, len, flags); } copied = sk_msg_recvmsg(sk, psock, msg, len, flags); while (copied == 0) { long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); if (!vsock_msg_wait_data(sk, psock, timeo)) { copied = -EAGAIN; break; } if (sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); return __vsock_recvmsg(sk, msg, len, flags); } copied = sk_msg_recvmsg(sk, psock, msg, len, flags); } release_sock(sk); sk_psock_put(sk, psock); return copied; } /* Copy of original proto with updated sock_map methods */ static struct proto vsock_bpf_prot = { .close = sock_map_close, .recvmsg = vsock_bpf_recvmsg, .sock_is_readable = sk_msg_is_readable, .unhash = sock_map_unhash, }; static void vsock_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; 
prot->recvmsg = vsock_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void vsock_bpf_check_needs_rebuild(struct proto *ops) { /* Paired with the smp_store_release() below. */ if (unlikely(ops != smp_load_acquire(&vsock_prot_saved))) { spin_lock_bh(&vsock_prot_lock); if (likely(ops != vsock_prot_saved)) { vsock_bpf_rebuild_protos(&vsock_bpf_prot, ops); /* Make sure proto function pointers are updated before publishing the * pointer to the struct. */ smp_store_release(&vsock_prot_saved, ops); } spin_unlock_bh(&vsock_prot_lock); } } int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { struct vsock_sock *vsk; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } vsk = vsock_sk(sk); if (!vsk->transport) return -ENODEV; if (!vsk->transport->read_skb) return -EOPNOTSUPP; vsock_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &vsock_bpf_prot); return 0; } void __init vsock_bpf_build_proto(void) { vsock_bpf_rebuild_protos(&vsock_bpf_prot, &vsock_proto); }
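/*
 * Illustrative sketch (hypothetical names) of the once-per-base-proto rebuild
 * pattern used by vsock_bpf_check_needs_rebuild() above: readers check with an
 * acquire load, the slow path copies the base proto, installs its overrides,
 * and only then publishes the base pointer with a release store, so concurrent
 * readers never observe a half-built proto.
 */
#include <linux/spinlock.h>
#include <net/sock.h>

static struct proto *example_saved_base __read_mostly;
static DEFINE_SPINLOCK(example_rebuild_lock);
static struct proto example_bpf_prot;

static void example_close(struct sock *sk, long timeout)
{
	/* stand-in for the sockmap-aware close */
}

static void example_check_needs_rebuild(struct proto *base)
{
	if (unlikely(base != smp_load_acquire(&example_saved_base))) {
		spin_lock_bh(&example_rebuild_lock);
		if (likely(base != example_saved_base)) {
			example_bpf_prot = *base;		/* start from the base proto */
			example_bpf_prot.close = example_close;	/* install overrides */
			/* Publish only after the overrides are in place. */
			smp_store_release(&example_saved_base, base);
		}
		spin_unlock_bh(&example_rebuild_lock);
	}
}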
4 4 4 3 3 4 4 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Destination Hashing scheduling module * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Inspired by the consistent hashing scheduler patch from * Thomas Proell <proellt@gmx.de> * * Changes: */ /* * The dh algorithm is to select server by the hash key of destination IP * address. The pseudo code is as follows: * * n <- servernode[dest_ip]; * if (n is dead) OR * (n is overloaded) OR (n.weight <= 0) then * return NULL; * * return n; * * Notes that servernode is a 256-bucket hash table that maps the hash * index derived from packet destination IP address to the current server * array. If the dh scheduler is used in cache cluster, it is good to * combine it with cache_bypass feature. When the statically assigned * server is dead or overloaded, the load balancer can bypass the cache * server and send requests to the original server directly. * */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/ip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/hash.h> #include <net/ip_vs.h> /* * IPVS DH bucket */ struct ip_vs_dh_bucket { struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* * for IPVS DH entry hash table */ #ifndef CONFIG_IP_VS_DH_TAB_BITS #define CONFIG_IP_VS_DH_TAB_BITS 8 #endif #define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) struct ip_vs_dh_state { struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; struct rcu_head rcu_head; }; /* * Returns hash value for IPVS DH entry */ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); } /* * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } /* * Assign all the hash buckets of the specified table with the service. 
*/ static int ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; bool empty; b = &s->buckets[0]; p = &svc->destinations; empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) ip_vs_dest_put(dest); if (empty) RCU_INIT_POINTER(b->dest, NULL); else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); ip_vs_dest_hold(dest); RCU_INIT_POINTER(b->dest, dest); p = p->next; } b++; } return 0; } /* * Flush all the hash buckets of the specified table. */ static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; struct ip_vs_dest *dest; b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) { ip_vs_dest_put(dest); RCU_INIT_POINTER(b->dest, NULL); } b++; } } static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { struct ip_vs_dh_state *s; /* allocate the DH table for this service */ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); if (s == NULL) return -ENOMEM; svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); /* assign the hash buckets with current dests */ ip_vs_dh_reassign(s, svc); return 0; } static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ ip_vs_dh_flush(s); /* release the table itself */ kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); } static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ ip_vs_dh_reassign(s, svc); return 0; } /* * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, * consider that the server is overloaded here. 
*/ static inline int is_overloaded(struct ip_vs_dest *dest) { return dest->flags & IP_VS_DEST_F_OVERLOAD; } /* * Destination hashing scheduling */ static struct ip_vs_dest * ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_dh_state *s; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); s = (struct ip_vs_dh_state *) svc->sched_data; dest = ip_vs_dh_get(svc->af, s, &iph->daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS DH Scheduler structure */ static struct ip_vs_scheduler ip_vs_dh_scheduler = { .name = "dh", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, .add_dest = ip_vs_dh_dest_changed, .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; static int __init ip_vs_dh_init(void) { return register_ip_vs_scheduler(&ip_vs_dh_scheduler); } static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); synchronize_rcu(); } module_init(ip_vs_dh_init); module_exit(ip_vs_dh_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs destination hashing scheduler");
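/*
 * Illustrative sketch: the destination-hashing decision above boils down to
 * "fold the destination address, hash it into IP_VS_DH_TAB_BITS bits, index
 * the per-service bucket table".  example_dh_bucket() shows that calculation
 * for IPv4 only; it is a hypothetical helper reusing IP_VS_DH_TAB_BITS from
 * the module above, not part of it.
 */
#include <linux/hash.h>
#include <linux/kernel.h>

static inline unsigned int example_dh_bucket(__be32 daddr)
{
	/* Same folding as ip_vs_dh_hashkey() for AF_INET: 0..IP_VS_DH_TAB_SIZE-1 */
	return hash_32(ntohl(daddr), IP_VS_DH_TAB_BITS);
}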
40 40 1 1 14 38 1 25 3 9 2 1 4 1 3 1 24 24 17 2 8 3 12 1 10 1 5 2 2 7 2 9 9 3 1 6 1 6 11 1 1 1 44 2 28 11 3 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel CIPSO/IPv4 Support * * This file defines the CIPSO/IPv4 functions for the NetLabel system. The * NetLabel system manages static and dynamic label mappings for network * protocols such as CIPSO and RIPSO. 
* * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 */ #include <linux/types.h> #include <linux/socket.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/sock.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> #include "netlabel_user.h" #include "netlabel_cipso_v4.h" #include "netlabel_mgmt.h" #include "netlabel_domainhash.h" /* Argument struct for cipso_v4_doi_walk() */ struct netlbl_cipsov4_doiwalk_arg { struct netlink_callback *nl_cb; struct sk_buff *skb; u32 seq; }; /* Argument struct for netlbl_domhsh_walk() */ struct netlbl_domhsh_walk_arg { struct netlbl_audit *audit_info; u32 doi; }; /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_cipsov4_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = { [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MTYPE] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_TAG] = { .type = NLA_U8 }, [NLBL_CIPSOV4_A_TAGLST] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSLVLLOC] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSLVLREM] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSLVL] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSLVLLST] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSCATLOC] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSCATREM] = { .type = NLA_U32 }, [NLBL_CIPSOV4_A_MLSCAT] = { .type = NLA_NESTED }, [NLBL_CIPSOV4_A_MLSCATLST] = { .type = NLA_NESTED }, }; /* * Helper Functions */ /** * netlbl_cipsov4_add_common - Parse the common sections of a ADD message * @info: the Generic NETLINK info block * @doi_def: the CIPSO V4 DOI definition * * Description: * Parse the common sections of a ADD message and fill in the related values * in @doi_def. Returns zero on success, negative values on failure. * */ static int netlbl_cipsov4_add_common(struct genl_info *info, struct cipso_v4_doi *doi_def) { struct nlattr *nla; int nla_rem; u32 iter = 0; doi_def->doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_TAGLST], NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) return -EINVAL; nla_for_each_nested(nla, info->attrs[NLBL_CIPSOV4_A_TAGLST], nla_rem) if (nla_type(nla) == NLBL_CIPSOV4_A_TAG) { if (iter >= CIPSO_V4_TAG_MAXCNT) return -EINVAL; doi_def->tags[iter++] = nla_get_u8(nla); } while (iter < CIPSO_V4_TAG_MAXCNT) doi_def->tags[iter++] = CIPSO_V4_TAG_INVALID; return 0; } /* * NetLabel Command Handlers */ /** * netlbl_cipsov4_add_std - Adds a CIPSO V4 DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CIPSO_V4_MAP_TRANS DOI definition based on the given ADD * message and add it to the CIPSO V4 engine. Return zero on success and * non-zero on error. 
* */ static int netlbl_cipsov4_add_std(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; struct cipso_v4_doi *doi_def = NULL; struct nlattr *nla_a; struct nlattr *nla_b; int nla_a_rem; int nla_b_rem; u32 iter; if (!info->attrs[NLBL_CIPSOV4_A_TAGLST] || !info->attrs[NLBL_CIPSOV4_A_MLSLVLLST]) return -EINVAL; if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (doi_def == NULL) return -ENOMEM; doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); if (doi_def->map.std == NULL) { kfree(doi_def); return -ENOMEM; } doi_def->type = CIPSO_V4_MAP_TRANS; ret_val = netlbl_cipsov4_add_common(info, doi_def); if (ret_val != 0) goto add_std_failure; ret_val = -EINVAL; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { if (nla_validate_nested_deprecated(nla_a, NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { case NLBL_CIPSOV4_A_MLSLVLLOC: if (nla_get_u32(nla_b) > CIPSO_V4_MAX_LOC_LVLS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->lvl.local_size) doi_def->map.std->lvl.local_size = nla_get_u32(nla_b) + 1; break; case NLBL_CIPSOV4_A_MLSLVLREM: if (nla_get_u32(nla_b) > CIPSO_V4_MAX_REM_LVLS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->lvl.cipso_size) doi_def->map.std->lvl.cipso_size = nla_get_u32(nla_b) + 1; break; } } doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; } for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++) doi_def->map.std->lvl.local[iter] = CIPSO_V4_INV_LVL; for (iter = 0; iter < doi_def->map.std->lvl.cipso_size; iter++) doi_def->map.std->lvl.cipso[iter] = CIPSO_V4_INV_LVL; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { struct nlattr *lvl_loc; struct nlattr *lvl_rem; lvl_loc = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSLVLLOC); lvl_rem = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSLVLREM); if (lvl_loc == NULL || lvl_rem == NULL) goto add_std_failure; doi_def->map.std->lvl.local[nla_get_u32(lvl_loc)] = nla_get_u32(lvl_rem); doi_def->map.std->lvl.cipso[nla_get_u32(lvl_rem)] = nla_get_u32(lvl_loc); } if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) { if (nla_validate_nested_deprecated(info->attrs[NLBL_CIPSOV4_A_MLSCATLST], NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSCATLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { if (nla_validate_nested_deprecated(nla_a, NLBL_CIPSOV4_A_MAX, netlbl_cipsov4_genl_policy, NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { case NLBL_CIPSOV4_A_MLSCATLOC: if (nla_get_u32(nla_b) > CIPSO_V4_MAX_LOC_CATS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->cat.local_size) doi_def->map.std->cat.local_size = nla_get_u32(nla_b) + 1; break; case NLBL_CIPSOV4_A_MLSCATREM: 
if (nla_get_u32(nla_b) > CIPSO_V4_MAX_REM_CATS) goto add_std_failure; if (nla_get_u32(nla_b) >= doi_def->map.std->cat.cipso_size) doi_def->map.std->cat.cipso_size = nla_get_u32(nla_b) + 1; break; } } doi_def->map.std->cat.local = kcalloc( doi_def->map.std->cat.local_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->cat.cipso = kcalloc( doi_def->map.std->cat.cipso_size, sizeof(u32), GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; } for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++) doi_def->map.std->cat.local[iter] = CIPSO_V4_INV_CAT; for (iter = 0; iter < doi_def->map.std->cat.cipso_size; iter++) doi_def->map.std->cat.cipso[iter] = CIPSO_V4_INV_CAT; nla_for_each_nested(nla_a, info->attrs[NLBL_CIPSOV4_A_MLSCATLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { struct nlattr *cat_loc; struct nlattr *cat_rem; cat_loc = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSCATLOC); cat_rem = nla_find_nested(nla_a, NLBL_CIPSOV4_A_MLSCATREM); if (cat_loc == NULL || cat_rem == NULL) goto add_std_failure; doi_def->map.std->cat.local[ nla_get_u32(cat_loc)] = nla_get_u32(cat_rem); doi_def->map.std->cat.cipso[ nla_get_u32(cat_rem)] = nla_get_u32(cat_loc); } } ret_val = cipso_v4_doi_add(doi_def, audit_info); if (ret_val != 0) goto add_std_failure; return 0; add_std_failure: cipso_v4_doi_free(doi_def); return ret_val; } /** * netlbl_cipsov4_add_pass - Adds a CIPSO V4 DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CIPSO_V4_MAP_PASS DOI definition based on the given ADD message * and add it to the CIPSO V4 engine. Return zero on success and non-zero on * error. * */ static int netlbl_cipsov4_add_pass(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def = NULL; if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (doi_def == NULL) return -ENOMEM; doi_def->type = CIPSO_V4_MAP_PASS; ret_val = netlbl_cipsov4_add_common(info, doi_def); if (ret_val != 0) goto add_pass_failure; ret_val = cipso_v4_doi_add(doi_def, audit_info); if (ret_val != 0) goto add_pass_failure; return 0; add_pass_failure: cipso_v4_doi_free(doi_def); return ret_val; } /** * netlbl_cipsov4_add_local - Adds a CIPSO V4 DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CIPSO_V4_MAP_LOCAL DOI definition based on the given ADD * message and add it to the CIPSO V4 engine. Return zero on success and * non-zero on error. 
* */ static int netlbl_cipsov4_add_local(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def = NULL; if (!info->attrs[NLBL_CIPSOV4_A_TAGLST]) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (doi_def == NULL) return -ENOMEM; doi_def->type = CIPSO_V4_MAP_LOCAL; ret_val = netlbl_cipsov4_add_common(info, doi_def); if (ret_val != 0) goto add_local_failure; ret_val = cipso_v4_doi_add(doi_def, audit_info); if (ret_val != 0) goto add_local_failure; return 0; add_local_failure: cipso_v4_doi_free(doi_def); return ret_val; } /** * netlbl_cipsov4_add - Handle an ADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Create a new DOI definition based on the given ADD message and add it to the * CIPSO V4 engine. Returns zero on success, negative values on failure. * */ static int netlbl_cipsov4_add(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_audit audit_info; if (!info->attrs[NLBL_CIPSOV4_A_DOI] || !info->attrs[NLBL_CIPSOV4_A_MTYPE]) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CIPSOV4_A_MTYPE])) { case CIPSO_V4_MAP_TRANS: ret_val = netlbl_cipsov4_add_std(info, &audit_info); break; case CIPSO_V4_MAP_PASS: ret_val = netlbl_cipsov4_add_pass(info, &audit_info); break; case CIPSO_V4_MAP_LOCAL: ret_val = netlbl_cipsov4_add_local(info, &audit_info); break; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); return ret_val; } /** * netlbl_cipsov4_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond accordingly. While the * response message generated by the kernel is straightforward, determining * before hand the size of the buffer to allocate is not (we have to generate * the message to know the size). In order to keep this function sane what we * do is allocate a buffer of NLMSG_GOODSIZE and try to fit the response in * that size, if we fail then we restart with a larger buffer and try again. * We continue in this manner until we hit a limit of failed attempts then we * give up and just send an error message. Returns zero on success and * negative values on error. 
* */ static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info) { int ret_val; struct sk_buff *ans_skb = NULL; u32 nlsze_mult = 1; void *data; u32 doi; struct nlattr *nla_a; struct nlattr *nla_b; struct cipso_v4_doi *doi_def; u32 iter; if (!info->attrs[NLBL_CIPSOV4_A_DOI]) { ret_val = -EINVAL; goto list_failure; } list_start: ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE * nlsze_mult, GFP_KERNEL); if (ans_skb == NULL) { ret_val = -ENOMEM; goto list_failure; } data = genlmsg_put_reply(ans_skb, info, &netlbl_cipsov4_gnl_family, 0, NLBL_CIPSOV4_C_LIST); if (data == NULL) { ret_val = -ENOMEM; goto list_failure; } doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); rcu_read_lock(); doi_def = cipso_v4_doi_getdef(doi); if (doi_def == NULL) { ret_val = -EINVAL; goto list_failure_lock; } ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type); if (ret_val != 0) goto list_failure_lock; nla_a = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_TAGLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_failure_lock; } for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID; iter++) { ret_val = nla_put_u8(ans_skb, NLBL_CIPSOV4_A_TAG, doi_def->tags[iter]); if (ret_val != 0) goto list_failure_lock; } nla_nest_end(ans_skb, nla_a); switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: nla_a = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSLVLLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_failure_lock; } for (iter = 0; iter < doi_def->map.std->lvl.local_size; iter++) { if (doi_def->map.std->lvl.local[iter] == CIPSO_V4_INV_LVL) continue; nla_b = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSLVL); if (nla_b == NULL) { ret_val = -ENOMEM; goto list_retry; } ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSLVLLOC, iter); if (ret_val != 0) goto list_retry; ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSLVLREM, doi_def->map.std->lvl.local[iter]); if (ret_val != 0) goto list_retry; nla_nest_end(ans_skb, nla_b); } nla_nest_end(ans_skb, nla_a); nla_a = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSCATLST); if (nla_a == NULL) { ret_val = -ENOMEM; goto list_retry; } for (iter = 0; iter < doi_def->map.std->cat.local_size; iter++) { if (doi_def->map.std->cat.local[iter] == CIPSO_V4_INV_CAT) continue; nla_b = nla_nest_start_noflag(ans_skb, NLBL_CIPSOV4_A_MLSCAT); if (nla_b == NULL) { ret_val = -ENOMEM; goto list_retry; } ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSCATLOC, iter); if (ret_val != 0) goto list_retry; ret_val = nla_put_u32(ans_skb, NLBL_CIPSOV4_A_MLSCATREM, doi_def->map.std->cat.local[iter]); if (ret_val != 0) goto list_retry; nla_nest_end(ans_skb, nla_b); } nla_nest_end(ans_skb, nla_a); break; } cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_retry: /* XXX - this limit is a guesstimate */ if (nlsze_mult < 4) { cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); kfree_skb(ans_skb); nlsze_mult *= 2; goto list_start; } list_failure_lock: cipso_v4_doi_putdef(doi_def); rcu_read_unlock(); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_cipsov4_listall_cb - cipso_v4_doi_walk() callback for LISTALL * @doi_def: the CIPSOv4 DOI definition * @arg: the netlbl_cipsov4_doiwalk_arg structure * * Description: * This function is designed to be used as a callback to the * cipso_v4_doi_walk() function for use in generating a response for a LISTALL * message. Returns the size of the message on success, negative values on * failure. 
* */ static int netlbl_cipsov4_listall_cb(struct cipso_v4_doi *doi_def, void *arg) { int ret_val = -ENOMEM; struct netlbl_cipsov4_doiwalk_arg *cb_arg = arg; void *data; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_cipsov4_gnl_family, NLM_F_MULTI, NLBL_CIPSOV4_C_LISTALL); if (data == NULL) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CIPSOV4_A_DOI, doi_def->doi); if (ret_val != 0) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CIPSOV4_A_MTYPE, doi_def->type); if (ret_val != 0) goto listall_cb_failure; genlmsg_end(cb_arg->skb, data); return 0; listall_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_cipsov4_listall - Handle a LISTALL message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated LISTALL message and respond accordingly. Returns * zero on success and negative values on error. * */ static int netlbl_cipsov4_listall(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_cipsov4_doiwalk_arg cb_arg; u32 doi_skip = cb->args[0]; cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; cipso_v4_doi_walk(&doi_skip, netlbl_cipsov4_listall_cb, &cb_arg); cb->args[0] = doi_skip; return skb->len; } /** * netlbl_cipsov4_remove_cb - netlbl_cipsov4_remove() callback for REMOVE * @entry: LSM domain mapping entry * @arg: the netlbl_domhsh_walk_arg structure * * Description: * This function is intended for use by netlbl_cipsov4_remove() as the callback * for the netlbl_domhsh_walk() function; it removes LSM domain map entries * which are associated with the CIPSO DOI specified in @arg. Returns zero on * success, negative values on failure. * */ static int netlbl_cipsov4_remove_cb(struct netlbl_dom_map *entry, void *arg) { struct netlbl_domhsh_walk_arg *cb_arg = arg; if (entry->def.type == NETLBL_NLTYPE_CIPSOV4 && entry->def.cipso->doi == cb_arg->doi) return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); return 0; } /** * netlbl_cipsov4_remove - Handle a REMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated REMOVE message and respond accordingly. Returns * zero on success, negative values on failure. 
* */ static int netlbl_cipsov4_remove(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_domhsh_walk_arg cb_arg; struct netlbl_audit audit_info; u32 skip_bkt = 0; u32 skip_chain = 0; if (!info->attrs[NLBL_CIPSOV4_A_DOI]) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CIPSOV4_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, netlbl_cipsov4_remove_cb, &cb_arg); if (ret_val == 0 || ret_val == -ENOENT) { ret_val = cipso_v4_doi_remove(cb_arg.doi, &audit_info); if (ret_val == 0) atomic_dec(&netlabel_mgmt_protocount); } return ret_val; } /* * NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_cipsov4_ops[] = { { .cmd = NLBL_CIPSOV4_C_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_cipsov4_add, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_REMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_cipsov4_remove, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_cipsov4_list, .dumpit = NULL, }, { .cmd = NLBL_CIPSOV4_C_LISTALL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_cipsov4_listall, }, }; static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_CIPSOV4_A_MAX, .policy = netlbl_cipsov4_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_cipsov4_ops, .n_small_ops = ARRAY_SIZE(netlbl_cipsov4_ops), .resv_start_op = NLBL_CIPSOV4_C_LISTALL + 1, }; /* * NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_cipsov4_genl_init - Register the CIPSOv4 NetLabel component * * Description: * Register the CIPSOv4 packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_cipsov4_genl_init(void) { return genl_register_family(&netlbl_cipsov4_gnl_family); }
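/*
 * Illustrative sketch (hypothetical helper) of the grow-and-retry reply
 * pattern used by netlbl_cipsov4_list() above: the reply size is not known
 * up front, so start from NLMSG_DEFAULT_SIZE and, if the attributes do not
 * fit, free the skb, double the multiplier and rebuild, up to a small bound.
 * example_fill() stands in for the attribute-emitting code.
 */
#include <net/genetlink.h>

static int example_build_reply(struct genl_info *info,
			       struct genl_family *family, u8 cmd,
			       int (*example_fill)(struct sk_buff *skb))
{
	u32 mult;

	for (mult = 1; mult <= 4; mult *= 2) {
		struct sk_buff *skb = nlmsg_new(NLMSG_DEFAULT_SIZE * mult,
						GFP_KERNEL);
		void *hdr;

		if (!skb)
			return -ENOMEM;
		hdr = genlmsg_put_reply(skb, info, family, 0, cmd);
		if (hdr && example_fill(skb) == 0) {
			genlmsg_end(skb, hdr);
			return genlmsg_reply(skb, info);
		}
		kfree_skb(skb);	/* header or attributes did not fit: retry bigger */
	}
	return -EMSGSIZE;
}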
4 2 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 /* * linux/fs/nls/nls_cp874.c * * Charset cp874 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2026, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0x90*/ 0x0000, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 0xa0*/ 0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, /* 0xb0*/ 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, /* 0xc0*/ 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, /* 0xd0*/ 0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37, 0x0e38, 0x0e39, 0x0e3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0e3f, /* 0xe0*/ 0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47, 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, /* 0xf0*/ 0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57, 
0x0e58, 0x0e59, 0x0e5a, 0x0e5b, 0x0000, 0x0000, 0x0000, 0x0000, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char page0e[256] = { 0x00, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0x00-0x07 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0x08-0x0f */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0x10-0x17 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0x18-0x1f */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0x20-0x27 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0x28-0x2f */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0x30-0x37 */ 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0x38-0x3f */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0x40-0x47 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0x48-0x4f */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0x50-0x57 */ 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x96, 0x97, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x91, 0x92, 0x00, 0x00, 0x93, 0x94, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0x20-0x27 */ }; static const unsigned char *const page_uni2charset[256] = { page00, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page0e, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 
0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0x00, 0x00, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 
0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp874", .alias = "tis-620", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp874(void) { return register_nls(&table); } static void __exit exit_nls_cp874(void) { unregister_nls(&table); } module_init(init_nls_cp874) module_exit(exit_nls_cp874) MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NLS(tis-620);
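/*
 * Illustrative sketch only, not part of the cp874 module above: how a
 * caller (e.g. a filesystem mounted with iocharset=cp874) might round-trip
 * one byte through the table registered by register_nls().  load_nls() and
 * unload_nls() are the standard NLS lookup helpers; the function name
 * below is hypothetical and the block is compiled out.
 */
#if 0
static int cp874_roundtrip_example(unsigned char c)
{
	struct nls_table *nls = load_nls("cp874");
	unsigned char out;
	wchar_t uni;
	int ret;

	if (!nls)
		return -EINVAL;

	/* raw CP874 byte -> Unicode code point (returns 1 on success) */
	ret = nls->char2uni(&c, 1, &uni);
	if (ret < 0)
		goto out;

	/* Unicode code point -> raw CP874 byte, bounded to one output byte */
	ret = nls->uni2char(uni, &out, 1);
out:
	unload_nls(nls);
	return ret;
}
#endif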
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved.
*/ #include <linux/skbuff.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_bonding.h> #include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/netlink.h> /* General definitions */ #define AD_SHORT_TIMEOUT 1 #define AD_LONG_TIMEOUT 0 #define AD_STANDBY 0x2 #define AD_MAX_TX_IN_SECOND 3 #define AD_COLLECTOR_MAX_DELAY 0 /* Timer definitions (43.4.4 in the 802.3ad standard) */ #define AD_FAST_PERIODIC_TIME 1 #define AD_SLOW_PERIODIC_TIME 30 #define AD_SHORT_TIMEOUT_TIME (3*AD_FAST_PERIODIC_TIME) #define AD_LONG_TIMEOUT_TIME (3*AD_SLOW_PERIODIC_TIME) #define AD_CHURN_DETECTION_TIME 60 #define AD_AGGREGATE_WAIT_TIME 2 /* Port Variables definitions used by the State Machines (43.4.7 in the * 802.3ad standard) */ #define AD_PORT_BEGIN 0x1 #define AD_PORT_LACP_ENABLED 0x2 #define AD_PORT_ACTOR_CHURN 0x4 #define AD_PORT_PARTNER_CHURN 0x8 #define AD_PORT_READY 0x10 #define AD_PORT_READY_N 0x20 #define AD_PORT_MATCHED 0x40 #define AD_PORT_STANDBY 0x80 #define AD_PORT_SELECTED 0x100 #define AD_PORT_MOVED 0x200 #define AD_PORT_CHURNED (AD_PORT_ACTOR_CHURN | AD_PORT_PARTNER_CHURN) /* Port Key definitions * key is determined according to the link speed, duplex and * user key (which is yet not supported) * -------------------------------------------------------------- * Port key | User key (10 bits) | Speed (5 bits) | Duplex| * -------------------------------------------------------------- * |15 6|5 1|0 */ #define AD_DUPLEX_KEY_MASKS 0x1 #define AD_SPEED_KEY_MASKS 0x3E #define AD_USER_KEY_MASKS 0xFFC0 enum ad_link_speed_type { AD_LINK_SPEED_1MBPS = 1, AD_LINK_SPEED_10MBPS, AD_LINK_SPEED_100MBPS, AD_LINK_SPEED_1000MBPS, AD_LINK_SPEED_2500MBPS, AD_LINK_SPEED_5000MBPS, AD_LINK_SPEED_10000MBPS, AD_LINK_SPEED_14000MBPS, AD_LINK_SPEED_20000MBPS, AD_LINK_SPEED_25000MBPS, AD_LINK_SPEED_40000MBPS, AD_LINK_SPEED_50000MBPS, AD_LINK_SPEED_56000MBPS, AD_LINK_SPEED_100000MBPS, AD_LINK_SPEED_200000MBPS, AD_LINK_SPEED_400000MBPS, AD_LINK_SPEED_800000MBPS, }; /* compare MAC addresses */ #define MAC_ADDRESS_EQUAL(A, B) \ ether_addr_equal_64bits((const u8 *)A, (const u8 *)B) static const u8 null_mac_addr[ETH_ALEN + 2] __long_aligned = { 0, 0, 0, 0, 0, 0 }; static const u16 ad_ticks_per_sec = 1000 / AD_TIMER_INTERVAL; static const int ad_delta_in_ticks = (AD_TIMER_INTERVAL * HZ) / 1000; const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 }; /* ================= main 802.3ad protocol functions ================== */ static int ad_lacpdu_send(struct port *port); static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); static void ad_initialize_port(struct port *port, int lacp_fast); static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_disable_collecting_distributing(struct port *port, 
bool *update_slave_arr); static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port); static void ad_marker_response_received(struct bond_marker *marker, struct port *port); static void ad_update_actor_keys(struct port *port, bool reset); /* ================= api to bonding and kernel code ================== */ /** * __get_bond_by_port - get the port's bonding struct * @port: the port we're looking at * * Return @port's bonding struct, or %NULL if it can't be found. */ static inline struct bonding *__get_bond_by_port(struct port *port) { if (port->slave == NULL) return NULL; return bond_get_bond_by_slave(port->slave); } /** * __get_first_agg - get the first aggregator in the bond * @port: the port we're looking at * * Return the aggregator of the first slave in @bond, or %NULL if it can't be * found. * The caller must hold RCU or RTNL lock. */ static inline struct aggregator *__get_first_agg(struct port *port) { struct bonding *bond = __get_bond_by_port(port); struct slave *first_slave; struct aggregator *agg; /* If there's no bond for this port, or bond has no slaves */ if (bond == NULL) return NULL; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); agg = first_slave ? &(SLAVE_AD_INFO(first_slave)->aggregator) : NULL; rcu_read_unlock(); return agg; } /** * __agg_has_partner - see if we have a partner * @agg: the agregator we're looking at * * Return nonzero if aggregator has a partner (denoted by a non-zero ether * address for the partner). Return 0 if not. */ static inline int __agg_has_partner(struct aggregator *agg) { return !is_zero_ether_addr(agg->partner_system.mac_addr_value); } /** * __disable_port - disable the port's slave * @port: the port we're looking at */ static inline void __disable_port(struct port *port) { bond_set_slave_inactive_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_port - enable the port's slave, if it's up * @port: the port we're looking at */ static inline void __enable_port(struct port *port) { struct slave *slave = port->slave; if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave)) bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __port_is_enabled - check if the port's slave is in active state * @port: the port we're looking at */ static inline int __port_is_enabled(struct port *port) { return bond_is_active_slave(port->slave); } /** * __get_agg_selection_mode - get the aggregator selection mode * @port: the port we're looking at * * Get the aggregator selection mode. Can be %STABLE, %BANDWIDTH or %COUNT. */ static inline u32 __get_agg_selection_mode(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return BOND_AD_STABLE; return bond->params.ad_select; } /** * __check_agg_selection_timer - check if the selection timer has expired * @port: the port we're looking at */ static inline int __check_agg_selection_timer(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return 0; return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** * __get_link_speed - get a port's speed * @port: the port we're looking at * * Return @port's speed in 802.3ad enum format. i.e. 
one of: * 0, * %AD_LINK_SPEED_10MBPS, * %AD_LINK_SPEED_100MBPS, * %AD_LINK_SPEED_1000MBPS, * %AD_LINK_SPEED_2500MBPS, * %AD_LINK_SPEED_5000MBPS, * %AD_LINK_SPEED_10000MBPS * %AD_LINK_SPEED_14000MBPS, * %AD_LINK_SPEED_20000MBPS * %AD_LINK_SPEED_25000MBPS * %AD_LINK_SPEED_40000MBPS * %AD_LINK_SPEED_50000MBPS * %AD_LINK_SPEED_56000MBPS * %AD_LINK_SPEED_100000MBPS * %AD_LINK_SPEED_200000MBPS * %AD_LINK_SPEED_400000MBPS * %AD_LINK_SPEED_800000MBPS */ static u16 __get_link_speed(struct port *port) { struct slave *slave = port->slave; u16 speed; /* this if covers only a special case: when the configuration starts * with link down, it sets the speed to 0. * This is done in spite of the fact that the e100 driver reports 0 * to be compatible with MVT in the future. */ if (slave->link != BOND_LINK_UP) speed = 0; else { switch (slave->speed) { case SPEED_10: speed = AD_LINK_SPEED_10MBPS; break; case SPEED_100: speed = AD_LINK_SPEED_100MBPS; break; case SPEED_1000: speed = AD_LINK_SPEED_1000MBPS; break; case SPEED_2500: speed = AD_LINK_SPEED_2500MBPS; break; case SPEED_5000: speed = AD_LINK_SPEED_5000MBPS; break; case SPEED_10000: speed = AD_LINK_SPEED_10000MBPS; break; case SPEED_14000: speed = AD_LINK_SPEED_14000MBPS; break; case SPEED_20000: speed = AD_LINK_SPEED_20000MBPS; break; case SPEED_25000: speed = AD_LINK_SPEED_25000MBPS; break; case SPEED_40000: speed = AD_LINK_SPEED_40000MBPS; break; case SPEED_50000: speed = AD_LINK_SPEED_50000MBPS; break; case SPEED_56000: speed = AD_LINK_SPEED_56000MBPS; break; case SPEED_100000: speed = AD_LINK_SPEED_100000MBPS; break; case SPEED_200000: speed = AD_LINK_SPEED_200000MBPS; break; case SPEED_400000: speed = AD_LINK_SPEED_400000MBPS; break; case SPEED_800000: speed = AD_LINK_SPEED_800000MBPS; break; default: /* unknown speed value from ethtool. shouldn't happen */ if (slave->speed != SPEED_UNKNOWN) pr_err_once("%s: (slave %s): unknown ethtool speed (%d) for port %d (set it to 0)\n", slave->bond->dev->name, slave->dev->name, slave->speed, port->actor_port_number); speed = 0; break; } } slave_dbg(slave->bond->dev, slave->dev, "Port %d Received link speed %d update from adapter\n", port->actor_port_number, speed); return speed; } /** * __get_duplex - get a port's duplex * @port: the port we're looking at * * Return @port's duplex in 802.3ad bitmask format. i.e.: * 0x01 if in full duplex * 0x00 otherwise */ static u8 __get_duplex(struct port *port) { struct slave *slave = port->slave; u8 retval = 0x0; /* handling a special case: when the configuration starts with * link down, it sets the duplex to 0. */ if (slave->link == BOND_LINK_UP) { switch (slave->duplex) { case DUPLEX_FULL: retval = 0x1; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status full duplex update from adapter\n", port->actor_port_number); break; case DUPLEX_HALF: default: retval = 0x0; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status NOT full duplex update from adapter\n", port->actor_port_number); break; } } return retval; } static void __ad_actor_update_port(struct port *port) { const struct bonding *bond = bond_get_bond_by_slave(port->slave); port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; } /* Conversions */ /** * __ad_timer_to_ticks - convert a given timer type to AD module ticks * @timer_type: which timer to operate * @par: timer parameter. see below * * If @timer_type is %current_while_timer, @par indicates long/short timer. 
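 * (Illustrative arithmetic, assuming the default AD_TIMER_INTERVAL of
 * 100 ms defined elsewhere: ad_ticks_per_sec is then 10, so the 3 s
 * short timeout maps to 30 ticks and the 90 s long timeout to 900 ticks.)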
* If @timer_type is %periodic_timer, @par is one of %FAST_PERIODIC_TIME, * %SLOW_PERIODIC_TIME. */ static u16 __ad_timer_to_ticks(u16 timer_type, u16 par) { u16 retval = 0; /* to silence the compiler */ switch (timer_type) { case AD_CURRENT_WHILE_TIMER: /* for rx machine usage */ if (par) retval = (AD_SHORT_TIMEOUT_TIME*ad_ticks_per_sec); else retval = (AD_LONG_TIMEOUT_TIME*ad_ticks_per_sec); break; case AD_ACTOR_CHURN_TIMER: /* for local churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_PERIODIC_TIMER: /* for periodic machine */ retval = (par*ad_ticks_per_sec); /* long timeout */ break; case AD_PARTNER_CHURN_TIMER: /* for remote churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_WAIT_WHILE_TIMER: /* for selection machine */ retval = (AD_AGGREGATE_WAIT_TIME*ad_ticks_per_sec); break; } return retval; } /* ================= ad_rx_machine helper functions ================== */ /** * __choose_matched - update a port's matched variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the matched variable, using parameter values from a * newly received lacpdu. Parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the actor. Matched is set to TRUE if all of these parameters * match and the PDU parameter partner_state.aggregation has the same value as * actor_oper_port_state.aggregation and lacp will actively maintain the link * in the aggregation. Matched is also set to TRUE if the value of * actor_state.aggregation in the received PDU is set to FALSE, i.e., indicates * an individual link and lacp will actively maintain the link. Otherwise, * matched is set to FALSE. LACP is considered to be actively maintaining the * link if either the PDU's actor_state.lacp_activity variable is TRUE or both * the actor's actor_oper_port_state.lacp_activity and the PDU's * partner_state.lacp_activity variables are TRUE. * * Note: the AD_PORT_MATCHED "variable" is not specified by 802.3ad; it is * used here to implement the language from 802.3ad 43.4.9 that requires * recordPDU to "match" the LACPDU parameters to the stored values. */ static void __choose_matched(struct lacpdu *lacpdu, struct port *port) { /* check if all parameters are alike * or this is individual link(aggregation == FALSE) * then update the state machine Matched variable. */ if (((ntohs(lacpdu->partner_port) == port->actor_port_number) && (ntohs(lacpdu->partner_port_priority) == port->actor_port_priority) && MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) && (ntohs(lacpdu->partner_system_priority) == port->actor_system_priority) && (ntohs(lacpdu->partner_key) == port->actor_oper_port_key) && ((lacpdu->partner_state & LACP_STATE_AGGREGATION) == (port->actor_oper_port_state & LACP_STATE_AGGREGATION))) || ((lacpdu->actor_state & LACP_STATE_AGGREGATION) == 0) ) { port->sm_vars |= AD_PORT_MATCHED; } else { port->sm_vars &= ~AD_PORT_MATCHED; } } /** * __record_pdu - record parameters from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Record the parameter values for the Actor carried in a received lacpdu as * the current partner operational parameter values and sets * actor_oper_port_state.defaulted to FALSE. 
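 * The recorded partner SYNCHRONIZATION bit is additionally gated below:
 * it stays set only when this port is MATCHED and the PDU's actor state
 * reports synchronization, otherwise it is cleared.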
*/ static void __record_pdu(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { struct port_params *partner = &port->partner_oper; __choose_matched(lacpdu, port); /* record the new parameter values for the partner * operational */ partner->port_number = ntohs(lacpdu->actor_port); partner->port_priority = ntohs(lacpdu->actor_port_priority); partner->system = lacpdu->actor_system; partner->system_priority = ntohs(lacpdu->actor_system_priority); partner->key = ntohs(lacpdu->actor_key); partner->port_state = lacpdu->actor_state; /* set actor_oper_port_state.defaulted to FALSE */ port->actor_oper_port_state &= ~LACP_STATE_DEFAULTED; /* set the partner sync. to on if the partner is sync, * and the port is matched */ if ((port->sm_vars & AD_PORT_MATCHED) && (lacpdu->actor_state & LACP_STATE_SYNCHRONIZATION)) { partner->port_state |= LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=1\n"); } else { partner->port_state &= ~LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=0\n"); } } } /** * __record_default - record default parameters * @port: the port we're looking at * * This function records the default parameter values for the partner carried * in the Partner Admin parameters as the current partner operational parameter * values and sets actor_oper_port_state.defaulted to TRUE. */ static void __record_default(struct port *port) { if (port) { /* record the partner admin parameters */ memcpy(&port->partner_oper, &port->partner_admin, sizeof(struct port_params)); /* set actor_oper_port_state.defaulted to true */ port->actor_oper_port_state |= LACP_STATE_DEFAULTED; } } /** * __update_selected - update a port's Selected variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the selected variable, using parameter values from a * newly received lacpdu. The parameter values for the Actor carried in the * received PDU are compared with the corresponding operational parameter * values for the ports partner. If one or more of the comparisons shows that * the value(s) received in the PDU differ from the current operational values, * then selected is set to FALSE and actor_oper_port_state.synchronization is * set to out_of_sync. Otherwise, selected remains unchanged. */ static void __update_selected(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { const struct port_params *partner = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (ntohs(lacpdu->actor_port) != partner->port_number || ntohs(lacpdu->actor_port_priority) != partner->port_priority || !MAC_ADDRESS_EQUAL(&lacpdu->actor_system, &partner->system) || ntohs(lacpdu->actor_system_priority) != partner->system_priority || ntohs(lacpdu->actor_key) != partner->key || (lacpdu->actor_state & LACP_STATE_AGGREGATION) != (partner->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_default_selected - update a port's Selected variable from Partner * @port: the port we're looking at * * This function updates the value of the selected variable, using the partner * administrative parameter values. The administrative values are compared with * the corresponding operational parameter values for the partner. 
If one or * more of the comparisons shows that the administrative value(s) differ from * the current operational values, then Selected is set to FALSE and * actor_oper_port_state.synchronization is set to OUT_OF_SYNC. Otherwise, * Selected remains unchanged. */ static void __update_default_selected(struct port *port) { if (port) { const struct port_params *admin = &port->partner_admin; const struct port_params *oper = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (admin->port_number != oper->port_number || admin->port_priority != oper->port_priority || !MAC_ADDRESS_EQUAL(&admin->system, &oper->system) || admin->system_priority != oper->system_priority || admin->key != oper->key || (admin->port_state & LACP_STATE_AGGREGATION) != (oper->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_ntt - update a port's ntt variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Updates the value of the ntt variable, using parameter values from a newly * received lacpdu. The parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the Actor. If one or more of the comparisons shows that the * value(s) received in the PDU differ from the current operational values, * then ntt is set to TRUE. Otherwise, ntt remains unchanged. */ static void __update_ntt(struct lacpdu *lacpdu, struct port *port) { /* validate lacpdu and port */ if (lacpdu && port) { /* check if any parameter is different then * update the port->ntt. */ if ((ntohs(lacpdu->partner_port) != port->actor_port_number) || (ntohs(lacpdu->partner_port_priority) != port->actor_port_priority) || !MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) || (ntohs(lacpdu->partner_system_priority) != port->actor_system_priority) || (ntohs(lacpdu->partner_key) != port->actor_oper_port_key) || ((lacpdu->partner_state & LACP_STATE_LACP_ACTIVITY) != (port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY)) || ((lacpdu->partner_state & LACP_STATE_LACP_TIMEOUT) != (port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)) || ((lacpdu->partner_state & LACP_STATE_SYNCHRONIZATION) != (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) || ((lacpdu->partner_state & LACP_STATE_AGGREGATION) != (port->actor_oper_port_state & LACP_STATE_AGGREGATION)) ) { port->ntt = true; } } } /** * __agg_ports_are_ready - check if all ports in an aggregator are ready * @aggregator: the aggregator we're looking at * */ static int __agg_ports_are_ready(struct aggregator *aggregator) { struct port *port; int retval = 1; if (aggregator) { /* scan all ports in this aggregator to verfy if they are * all ready. 
*/ for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (!(port->sm_vars & AD_PORT_READY_N)) { retval = 0; break; } } } return retval; } /** * __set_agg_ports_ready - set value of Ready bit in all ports of an aggregator * @aggregator: the aggregator we're looking at * @val: Should the ports' ready bit be set on or off * */ static void __set_agg_ports_ready(struct aggregator *aggregator, int val) { struct port *port; for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (val) port->sm_vars |= AD_PORT_READY; else port->sm_vars &= ~AD_PORT_READY; } } static int __agg_active_ports(struct aggregator *agg) { struct port *port; int active = 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (port->is_enabled) active++; } return active; } /** * __get_agg_bandwidth - get the total bandwidth of an aggregator * @aggregator: the aggregator we're looking at * */ static u32 __get_agg_bandwidth(struct aggregator *aggregator) { int nports = __agg_active_ports(aggregator); u32 bandwidth = 0; if (nports) { switch (__get_link_speed(aggregator->lag_ports)) { case AD_LINK_SPEED_1MBPS: bandwidth = nports; break; case AD_LINK_SPEED_10MBPS: bandwidth = nports * 10; break; case AD_LINK_SPEED_100MBPS: bandwidth = nports * 100; break; case AD_LINK_SPEED_1000MBPS: bandwidth = nports * 1000; break; case AD_LINK_SPEED_2500MBPS: bandwidth = nports * 2500; break; case AD_LINK_SPEED_5000MBPS: bandwidth = nports * 5000; break; case AD_LINK_SPEED_10000MBPS: bandwidth = nports * 10000; break; case AD_LINK_SPEED_14000MBPS: bandwidth = nports * 14000; break; case AD_LINK_SPEED_20000MBPS: bandwidth = nports * 20000; break; case AD_LINK_SPEED_25000MBPS: bandwidth = nports * 25000; break; case AD_LINK_SPEED_40000MBPS: bandwidth = nports * 40000; break; case AD_LINK_SPEED_50000MBPS: bandwidth = nports * 50000; break; case AD_LINK_SPEED_56000MBPS: bandwidth = nports * 56000; break; case AD_LINK_SPEED_100000MBPS: bandwidth = nports * 100000; break; case AD_LINK_SPEED_200000MBPS: bandwidth = nports * 200000; break; case AD_LINK_SPEED_400000MBPS: bandwidth = nports * 400000; break; case AD_LINK_SPEED_800000MBPS: bandwidth = nports * 800000; break; default: bandwidth = 0; /* to silence the compiler */ } } return bandwidth; } /** * __get_active_agg - get the current active aggregator * @aggregator: the aggregator we're looking at * * Caller must hold RCU lock. 
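 * Returns the first aggregator in the bond whose is_active flag is set,
 * or %NULL when no aggregator is currently active.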
*/ static struct aggregator *__get_active_agg(struct aggregator *aggregator) { struct bonding *bond = aggregator->slave->bond; struct list_head *iter; struct slave *slave; bond_for_each_slave_rcu(bond, slave, iter) if (SLAVE_AD_INFO(slave)->aggregator.is_active) return &(SLAVE_AD_INFO(slave)->aggregator); return NULL; } /** * __update_lacpdu_from_port - update a port's lacpdu fields * @port: the port we're looking at */ static inline void __update_lacpdu_from_port(struct port *port) { struct lacpdu *lacpdu = &port->lacpdu; const struct port_params *partner = &port->partner_oper; /* update current actual Actor parameters * lacpdu->subtype initialized * lacpdu->version_number initialized * lacpdu->tlv_type_actor_info initialized * lacpdu->actor_information_length initialized */ lacpdu->actor_system_priority = htons(port->actor_system_priority); lacpdu->actor_system = port->actor_system; lacpdu->actor_key = htons(port->actor_oper_port_key); lacpdu->actor_port_priority = htons(port->actor_port_priority); lacpdu->actor_port = htons(port->actor_port_number); lacpdu->actor_state = port->actor_oper_port_state; slave_dbg(port->slave->bond->dev, port->slave->dev, "update lacpdu: actor port state %x\n", port->actor_oper_port_state); /* lacpdu->reserved_3_1 initialized * lacpdu->tlv_type_partner_info initialized * lacpdu->partner_information_length initialized */ lacpdu->partner_system_priority = htons(partner->system_priority); lacpdu->partner_system = partner->system; lacpdu->partner_key = htons(partner->key); lacpdu->partner_port_priority = htons(partner->port_priority); lacpdu->partner_port = htons(partner->port_number); lacpdu->partner_state = partner->port_state; /* lacpdu->reserved_3_2 initialized * lacpdu->tlv_type_collector_info initialized * lacpdu->collector_information_length initialized * collector_max_delay initialized * reserved_12[12] initialized * tlv_type_terminator initialized * terminator_length initialized * reserved_50[50] initialized */ } /* ================= main 802.3ad protocol code ========================= */ /** * ad_lacpdu_send - send out a lacpdu packet on a given port * @port: the port we're looking at * * Returns: 0 on success * < 0 on error */ static int ad_lacpdu_send(struct port *port) { struct slave *slave = port->slave; struct sk_buff *skb; struct lacpdu_header *lacpdu_header; int length = sizeof(struct lacpdu_header); skb = dev_alloc_skb(length); if (!skb) return -ENOMEM; atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.lacpdu_tx); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; skb->priority = TC_PRIO_CONTROL; lacpdu_header = skb_put(skb, length); ether_addr_copy(lacpdu_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback lacpdus in receive. 
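 * (ad_rx_machine drops, with an error message, any LACPDU whose
 * actor_system equals this port's own actor_system.)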
*/ ether_addr_copy(lacpdu_header->hdr.h_source, slave->perm_hwaddr); lacpdu_header->hdr.h_proto = PKT_TYPE_LACPDU; lacpdu_header->lacpdu = port->lacpdu; dev_queue_xmit(skb); return 0; } /** * ad_marker_send - send marker information/response on a given port * @port: the port we're looking at * @marker: marker data to send * * Returns: 0 on success * < 0 on error */ static int ad_marker_send(struct port *port, struct bond_marker *marker) { struct slave *slave = port->slave; struct sk_buff *skb; struct bond_marker_header *marker_header; int length = sizeof(struct bond_marker_header); skb = dev_alloc_skb(length + 16); if (!skb) return -ENOMEM; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_tx); break; case AD_MARKER_RESPONSE_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_resp_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_resp_tx); break; } skb_reserve(skb, 16); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; marker_header = skb_put(skb, length); ether_addr_copy(marker_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback MARKERs in receive. */ ether_addr_copy(marker_header->hdr.h_source, slave->perm_hwaddr); marker_header->hdr.h_proto = PKT_TYPE_LACPDU; marker_header->marker = *marker; dev_queue_xmit(skb); return 0; } /** * ad_mux_machine - handle a port's mux state machine * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_mux_machine(struct port *port, bool *update_slave_arr) { mux_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_mux_state; if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { switch (port->sm_mux_state) { case AD_MUX_DETACHED: if ((port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) /* if SELECTED or STANDBY */ port->sm_mux_state = AD_MUX_WAITING; break; case AD_MUX_WAITING: /* if SELECTED == FALSE return to DETACH state */ if (!(port->sm_vars & AD_PORT_SELECTED)) { port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the Selection Logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } /* check if the wait_while_timer expired */ if (port->sm_mux_timer_counter && !(--port->sm_mux_timer_counter)) port->sm_vars |= AD_PORT_READY_N; /* in order to withhold the selection logic to check * all ports READY_N value every callback cycle to * update ready variable, we check READY_N and update * READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state */ if ((port->sm_vars & AD_PORT_READY) && !port->sm_mux_timer_counter) port->sm_mux_state = AD_MUX_ATTACHED; break; case AD_MUX_ATTACHED: /* check also if agg_select_timer expired (so the * edable port will take place only after this timer) */ if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { if 
(port->aggregator->is_active) port->sm_mux_state = AD_MUX_COLLECTING_DISTRIBUTING; } else if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) { /* if UNSELECTED or STANDBY */ port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the selection logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; } else if (port->aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } break; case AD_MUX_COLLECTING_DISTRIBUTING: if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) { port->sm_mux_state = AD_MUX_ATTACHED; } else { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator && port->aggregator->is_active && !__port_is_enabled(port)) { __enable_port(port); *update_slave_arr = true; } } break; default: break; } } /* check if the state machine was changed */ if (port->sm_mux_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Mux Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_mux_state); switch (port->sm_mux_state) { case AD_MUX_DETACHED: port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; ad_disable_collecting_distributing(port, update_slave_arr); port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->ntt = true; break; case AD_MUX_WAITING: port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: if (port->aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; ad_disable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; default: break; } } } /** * ad_rx_machine - handle a port's rx State Machine * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * If lacpdu arrived, stop previous timer (if exists) and set the next state as * CURRENT. If timer expired set the state machine in the proper state. * In other cases, this function checks if we need to switch to other state. 
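 * The rx machine moves between %AD_RX_INITIALIZE, %AD_RX_PORT_DISABLED,
 * %AD_RX_LACP_DISABLED, %AD_RX_EXPIRED, %AD_RX_DEFAULTED and
 * %AD_RX_CURRENT, as handled in the switch below.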
*/ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) { rx_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_rx_state; if (lacpdu) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.lacpdu_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.lacpdu_rx); } /* check if state machine should change state */ /* first, check if port was reinitialized */ if (port->sm_vars & AD_PORT_BEGIN) { port->sm_rx_state = AD_RX_INITIALIZE; port->sm_vars |= AD_PORT_CHURNED; /* check if port is not enabled */ } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled) port->sm_rx_state = AD_RX_PORT_DISABLED; /* check if new lacpdu arrived */ else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) || (port->sm_rx_state == AD_RX_DEFAULTED) || (port->sm_rx_state == AD_RX_CURRENT))) { if (port->sm_rx_state != AD_RX_CURRENT) port->sm_vars |= AD_PORT_CHURNED; port->sm_rx_timer_counter = 0; port->sm_rx_state = AD_RX_CURRENT; } else { /* if timer is on, and if it is expired */ if (port->sm_rx_timer_counter && !(--port->sm_rx_timer_counter)) { switch (port->sm_rx_state) { case AD_RX_EXPIRED: port->sm_rx_state = AD_RX_DEFAULTED; break; case AD_RX_CURRENT: port->sm_rx_state = AD_RX_EXPIRED; break; default: break; } } else { /* if no lacpdu arrived and no timer is on */ switch (port->sm_rx_state) { case AD_RX_PORT_DISABLED: if (port->is_enabled && (port->sm_vars & AD_PORT_LACP_ENABLED)) port->sm_rx_state = AD_RX_EXPIRED; else if (port->is_enabled && ((port->sm_vars & AD_PORT_LACP_ENABLED) == 0)) port->sm_rx_state = AD_RX_LACP_DISABLED; break; default: break; } } } /* check if the State machine was changed or new lacpdu arrived */ if ((port->sm_rx_state != last_state) || (lacpdu)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Rx Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_rx_state); switch (port->sm_rx_state) { case AD_RX_INITIALIZE: if (!(port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)) port->sm_vars &= ~AD_PORT_LACP_ENABLED; else port->sm_vars |= AD_PORT_LACP_ENABLED; port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; port->sm_rx_state = AD_RX_PORT_DISABLED; fallthrough; case AD_RX_PORT_DISABLED: port->sm_vars &= ~AD_PORT_MATCHED; break; case AD_RX_LACP_DISABLED: port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->partner_oper.port_state &= ~LACP_STATE_AGGREGATION; port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_EXPIRED: /* Reset of the Synchronization flag (Standard 43.4.12) * This reset cause to disable this port in the * COLLECTING_DISTRIBUTING state of the mux machine in * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. 
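 * While EXPIRED the partner is treated as an active, short-timeout
 * LACP speaker (LACP_ACTIVITY and LACP_TIMEOUT are forced on below),
 * so the current_while timer restarts with the short timeout.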
*/ port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->sm_vars &= ~AD_PORT_MATCHED; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; break; case AD_RX_DEFAULTED: __update_default_selected(port); __record_default(port); port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_CURRENT: /* detect loopback situation */ if (MAC_ADDRESS_EQUAL(&(lacpdu->actor_system), &(port->actor_system))) { slave_err(port->slave->bond->dev, port->slave->dev, "An illegal loopback occurred on slave\n" "Check the configuration to verify that all adapters are connected to 802.3ad compliant switch ports\n"); return; } __update_selected(lacpdu, port); __update_ntt(lacpdu, port); __record_pdu(lacpdu, port); port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; default: break; } } } /** * ad_churn_machine - handle port churn's state machine * @port: the port we're looking at * */ static void ad_churn_machine(struct port *port) { if (port->sm_vars & AD_PORT_CHURNED) { port->sm_vars &= ~AD_PORT_CHURNED; port->sm_churn_actor_state = AD_CHURN_MONITOR; port->sm_churn_partner_state = AD_CHURN_MONITOR; port->sm_churn_actor_timer_counter = __ad_timer_to_ticks(AD_ACTOR_CHURN_TIMER, 0); port->sm_churn_partner_timer_counter = __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0); return; } if (port->sm_churn_actor_timer_counter && !(--port->sm_churn_actor_timer_counter) && port->sm_churn_actor_state == AD_CHURN_MONITOR) { if (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_actor_state = AD_NO_CHURN; } else { port->churn_actor_count++; port->sm_churn_actor_state = AD_CHURN; } } if (port->sm_churn_partner_timer_counter && !(--port->sm_churn_partner_timer_counter) && port->sm_churn_partner_state == AD_CHURN_MONITOR) { if (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_partner_state = AD_NO_CHURN; } else { port->churn_partner_count++; port->sm_churn_partner_state = AD_CHURN; } } } /** * ad_tx_machine - handle a port's tx state machine * @port: the port we're looking at */ static void ad_tx_machine(struct port *port) { /* check if tx timer expired, to verify that we do not send more than * 3 packets per second */ if (port->sm_tx_timer_counter && !(--port->sm_tx_timer_counter)) { /* check if there is something to send */ if (port->ntt && (port->sm_vars & AD_PORT_LACP_ENABLED)) { __update_lacpdu_from_port(port); if (ad_lacpdu_send(port) >= 0) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent LACPDU on port %d\n", port->actor_port_number); /* mark ntt as false, so it will not be sent * again until demanded */ port->ntt = false; } } /* restart tx timer(to verify that we will not exceed * AD_MAX_TX_IN_SECOND */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; } } /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. 
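 * The transmission interval follows the partner's LACP_TIMEOUT bit:
 * fast periodic (every %AD_FAST_PERIODIC_TIME seconds) when the partner
 * requests a short timeout, slow periodic (every %AD_SLOW_PERIODIC_TIME
 * seconds) otherwise, and no periodic transmission at all when LACP is
 * passive on both ends or lacp_active is off.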
*/ static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) { periodic_states_t last_state; /* keep current state machine state to compare later if it was changed */ last_state = port->sm_periodic_state; /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || !bond_params->lacp_active) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ else if (port->sm_periodic_timer_counter) { /* check if periodic state machine expired */ if (!(--port->sm_periodic_timer_counter)) { /* if expired then do tx */ port->sm_periodic_state = AD_PERIODIC_TX; } else { /* If not expired, check if there is some new timeout * parameter from the partner state */ switch (port->sm_periodic_state) { case AD_FAST_PERIODIC: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; break; case AD_SLOW_PERIODIC: if ((port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) { port->sm_periodic_timer_counter = 0; port->sm_periodic_state = AD_PERIODIC_TX; } break; default: break; } } } else { switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_state = AD_FAST_PERIODIC; break; case AD_PERIODIC_TX: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; else port->sm_periodic_state = AD_FAST_PERIODIC; break; default: break; } } /* check if the state machine was changed */ if (port->sm_periodic_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Periodic Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_periodic_state); switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_timer_counter = 0; break; case AD_FAST_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_FAST_PERIODIC_TIME))-1; break; case AD_SLOW_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_SLOW_PERIODIC_TIME))-1; break; case AD_PERIODIC_TX: port->ntt = true; break; default: break; } } } /** * ad_port_selection_logic - select aggregation groups * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Select aggregation groups, and assign each port for it's aggregetor. The * selection logic is called in the inititalization (after all the handshkes), * and after every lacpdu receive (if selected is off). 
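 * A port joins an existing aggregator only when it has an answering
 * partner whose system, system priority and key match that aggregator's,
 * the oper keys match, and the aggregator is not individual; otherwise it
 * is attached to a free aggregator, or an error is logged if none is
 * available.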
*/ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) { struct aggregator *aggregator, *free_aggregator = NULL, *temp_aggregator; struct port *last_port = NULL, *curr_port; struct list_head *iter; struct bonding *bond; struct slave *slave; int found = 0; /* if the port is already Selected, do nothing */ if (port->sm_vars & AD_PORT_SELECTED) return; bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ if (port->aggregator) { /* detach the port from its former aggregator */ temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { if (curr_port == port) { temp_aggregator->num_of_ports--; /* if it is the first port attached to the * aggregator */ if (!last_port) { temp_aggregator->lag_ports = port->next_port_in_aggregator; } else { /* not the first port attached to the * aggregator */ last_port->next_port_in_aggregator = port->next_port_in_aggregator; } /* clear the port's relations to this * aggregator */ port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; slave_dbg(bond->dev, port->slave->dev, "Port %d left LAG %d\n", port->actor_port_number, temp_aggregator->aggregator_identifier); /* if the aggregator is empty, clear its * parameters, and set it ready to be attached */ if (!temp_aggregator->lag_ports) ad_clear_agg(temp_aggregator); break; } } if (!curr_port) { /* meaning: the port was related to an aggregator * but was not on the aggregator port list */ net_warn_ratelimited("%s: (slave %s): Warning: Port %d was related to aggregator %d but was not on its port list\n", port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, port->aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ bond_for_each_slave(bond, slave, iter) { aggregator = &(SLAVE_AD_INFO(slave)->aggregator); /* keep a free aggregator for later use(if needed) */ if (!aggregator->lag_ports) { if (!free_aggregator) free_aggregator = aggregator; continue; } /* check if current aggregator suits us */ if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */ MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) && (aggregator->partner_system_priority == port->partner_oper.system_priority) && (aggregator->partner_oper_aggregator_key == port->partner_oper.key) ) && ((!MAC_ADDRESS_EQUAL(&(port->partner_oper.system), &(null_mac_addr)) && /* partner answers */ !aggregator->is_individual) /* but is not individual OR */ ) ) { /* attach to the founded aggregator */ port->aggregator = aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; port->aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; found = 1; break; } } /* the port couldn't find an aggregator - attach it to a new * aggregator */ if (!found) { if (free_aggregator) { /* assign port a new aggregator */ port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the 
end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ port->aggregator->is_individual = false; else port->aggregator->is_individual = true; port->aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; port->aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; port->aggregator->partner_system = port->partner_oper.system; port->aggregator->partner_system_priority = port->partner_oper.system_priority; port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; port->aggregator->receive_state = 1; port->aggregator->transmit_state = 1; port->aggregator->lag_ports = port; port->aggregator->num_of_ports++; /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", port->actor_port_number); return; } } /* if all aggregator's ports are READY_N == TRUE, set ready=TRUE * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, update_slave_arr); if (!port->aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } /* Decide if "agg" is a better choice for the new active aggregator that * the current best, according to the ad_select policy. */ static struct aggregator *ad_agg_selection_test(struct aggregator *best, struct aggregator *curr) { /* 0. If no best, select current. * * 1. If the current agg is not individual, and the best is * individual, select current. * * 2. If current agg is individual and the best is not, keep best. * * 3. Therefore, current and best are both individual or both not * individual, so: * * 3a. If current agg partner replied, and best agg partner did not, * select current. * * 3b. If current agg partner did not reply and best agg partner * did reply, keep best. * * 4. Therefore, current and best both have partner replies or * both do not, so perform selection policy: * * BOND_AD_COUNT: Select by count of ports. If count is equal, * select by bandwidth. * * BOND_AD_STABLE, BOND_AD_BANDWIDTH: Select by bandwidth. 
*/ if (!best) return curr; if (!curr->is_individual && best->is_individual) return curr; if (curr->is_individual && !best->is_individual) return best; if (__agg_has_partner(curr) && !__agg_has_partner(best)) return curr; if (!__agg_has_partner(curr) && __agg_has_partner(best)) return best; switch (__get_agg_selection_mode(curr->lag_ports)) { case BOND_AD_COUNT: if (__agg_active_ports(curr) > __agg_active_ports(best)) return curr; if (__agg_active_ports(curr) < __agg_active_ports(best)) return best; fallthrough; case BOND_AD_STABLE: case BOND_AD_BANDWIDTH: if (__get_agg_bandwidth(curr) > __get_agg_bandwidth(best)) return curr; break; default: net_warn_ratelimited("%s: (slave %s): Impossible agg select mode %d\n", curr->slave->bond->dev->name, curr->slave->dev->name, __get_agg_selection_mode(curr->lag_ports)); break; } return best; } static int agg_device_up(const struct aggregator *agg) { struct port *port = agg->lag_ports; if (!port) return 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (netif_running(port->slave->dev) && netif_carrier_ok(port->slave->dev)) return 1; } return 0; } /** * ad_agg_selection_logic - select an aggregation group for a team * @agg: the aggregator we're looking at * @update_slave_arr: Does slave array need update? * * It is assumed that only one aggregator may be selected for a team. * * The logic of this function is to select the aggregator according to * the ad_select policy: * * BOND_AD_STABLE: select the aggregator with the most ports attached to * it, and to reselect the active aggregator only if the previous * aggregator has no more ports related to it. * * BOND_AD_BANDWIDTH: select the aggregator with the highest total * bandwidth, and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * BOND_AD_COUNT: select the aggregator with largest number of ports * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. */ static void ad_agg_selection_logic(struct aggregator *agg, bool *update_slave_arr) { struct aggregator *best, *active, *origin; struct bonding *bond = agg->slave->bond; struct list_head *iter; struct slave *slave; struct port *port; rcu_read_lock(); origin = agg; active = __get_active_agg(agg); best = (active && agg_device_up(active)) ? active : NULL; bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); agg->is_active = 0; if (__agg_active_ports(agg) && agg_device_up(agg)) best = ad_agg_selection_test(best, agg); } if (best && __get_agg_selection_mode(best->lag_ports) == BOND_AD_STABLE) { /* For the STABLE policy, don't replace the old active * aggregator if it's still active (it has an answering * partner) or if both the best and active don't have an * answering partner. 
*/ if (active && active->lag_ports && __agg_active_ports(active) && (__agg_has_partner(active) || (!__agg_has_partner(active) && !__agg_has_partner(best)))) { if (!(!active->actor_oper_aggregator_key && best->actor_oper_aggregator_key)) { best = NULL; active->is_active = 1; } } } if (best && (best == active)) { best = NULL; active->is_active = 1; } /* if there is new best aggregator, activate it */ if (best) { netdev_dbg(bond->dev, "(slave %s): best Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); netdev_dbg(bond->dev, "(slave %s): best ports %p slave %p\n", best->slave ? best->slave->dev->name : "NULL", best->lag_ports, best->slave); bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); slave_dbg(bond->dev, slave->dev, "Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", agg->aggregator_identifier, agg->num_of_ports, agg->actor_oper_aggregator_key, agg->partner_oper_aggregator_key, agg->is_individual, agg->is_active); } /* check if any partner replies */ if (best->is_individual) net_warn_ratelimited("%s: Warning: No 802.3ad response from the link partner for any adapters in the bond\n", bond->dev->name); best->is_active = 1; netdev_dbg(bond->dev, "(slave %s): LAG %d chosen as the active LAG\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier); netdev_dbg(bond->dev, "(slave %s): Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); /* disable the ports that were related to the former * active_aggregator */ if (active) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __disable_port(port); } } /* Slave array needs update. */ *update_slave_arr = true; } /* if the selected aggregator is of join individuals * (partner_system is NULL), enable their ports */ active = __get_active_agg(origin); if (active) { if (!__agg_has_partner(active)) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __enable_port(port); } *update_slave_arr = true; } } rcu_read_unlock(); bond_3ad_set_carrier(bond); } /** * ad_clear_agg - clear a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_clear_agg(struct aggregator *aggregator) { if (aggregator) { aggregator->is_individual = false; aggregator->actor_admin_aggregator_key = 0; aggregator->actor_oper_aggregator_key = 0; eth_zero_addr(aggregator->partner_system.mac_addr_value); aggregator->partner_system_priority = 0; aggregator->partner_oper_aggregator_key = 0; aggregator->receive_state = 0; aggregator->transmit_state = 0; aggregator->lag_ports = NULL; aggregator->is_active = 0; aggregator->num_of_ports = 0; pr_debug("%s: LAG %d was cleared\n", aggregator->slave ? 
aggregator->slave->dev->name : "NULL", aggregator->aggregator_identifier); } } /** * ad_initialize_agg - initialize a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_initialize_agg(struct aggregator *aggregator) { if (aggregator) { ad_clear_agg(aggregator); eth_zero_addr(aggregator->aggregator_mac_address.mac_addr_value); aggregator->aggregator_identifier = 0; aggregator->slave = NULL; } } /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at * @lacp_fast: boolean. whether fast periodic should be used */ static void ad_initialize_port(struct port *port, int lacp_fast) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, .port_state = 1, }; static const struct lacpdu lacpdu = { .subtype = 0x01, .version_number = 0x01, .tlv_type_actor_info = 0x01, .actor_information_length = 0x14, .tlv_type_partner_info = 0x02, .partner_information_length = 0x14, .tlv_type_collector_info = 0x03, .collector_information_length = 0x10, .collector_max_delay = htons(AD_COLLECTOR_MAX_DELAY), }; if (port) { port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; port->actor_admin_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; port->actor_oper_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); memcpy(&port->partner_oper, &tmpl, sizeof(tmpl)); port->is_enabled = true; /* private parameters */ port->sm_vars = AD_PORT_BEGIN | AD_PORT_LACP_ENABLED; port->sm_rx_state = 0; port->sm_rx_timer_counter = 0; port->sm_periodic_state = 0; port->sm_periodic_timer_counter = 0; port->sm_mux_state = 0; port->sm_mux_timer_counter = 0; port->sm_tx_state = 0; port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->transaction_id = 0; port->sm_churn_actor_timer_counter = 0; port->sm_churn_actor_state = 0; port->churn_actor_count = 0; port->sm_churn_partner_timer_counter = 0; port->sm_churn_partner_state = 0; port->churn_partner_count = 0; memcpy(&port->lacpdu, &lacpdu, sizeof(lacpdu)); } } /** * ad_enable_collecting_distributing - enable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; } } /** * ad_disable_collecting_distributing - disable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? 
*/ static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && !MAC_ADDRESS_EQUAL(&(port->aggregator->partner_system), &(null_mac_addr))) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_marker_info_received - handle receive of a Marker information frame * @marker_info: Marker info received * @port: the port we're looking at */ static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port) { struct bond_marker marker; atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_rx); /* copy the received marker data to the response marker */ memcpy(&marker, marker_info, sizeof(struct bond_marker)); /* change the marker subtype to marker response */ marker.tlv_type = AD_MARKER_RESPONSE_SUBTYPE; /* send the marker response */ if (ad_marker_send(port, &marker) >= 0) slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent Marker Response on port %d\n", port->actor_port_number); } /** * ad_marker_response_received - handle receive of a marker response frame * @marker: marker PDU received * @port: the port we're looking at * * This function does nothing since we decided not to implement send and handle * response for marker PDU's, in this stage, but only to respond to marker * information. */ static void ad_marker_response_received(struct bond_marker *marker, struct port *port) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_resp_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_resp_rx); /* DO NOTHING, SINCE WE DECIDED NOT TO IMPLEMENT THIS FEATURE FOR NOW */ } /* ========= AD exported functions to the main bonding code ========= */ /* Check aggregators status in team every T seconds */ #define AD_AGGREGATOR_SELECTION_TIMER 8 /** * bond_3ad_initiate_agg_selection - initate aggregator selection * @bond: bonding struct * @timeout: timeout value to set * * Set the aggregation selection timer, to initiate an agg selection in * the very near future. Called during first initialization, and during * any down to up transitions of the bond. */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** * bond_3ad_initialize - initialize a bond's 802.3ad parameters and structures * @bond: bonding struct to work on * * Can be called only after the mac address of the bond is set. */ void bond_3ad_initialize(struct bonding *bond) { BOND_AD_INFO(bond).aggregator_identifier = 0; BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); bond_3ad_initiate_agg_selection(bond, AD_AGGREGATOR_SELECTION_TIMER * ad_ticks_per_sec); } /** * bond_3ad_bind_slave - initialize a slave's port * @slave: slave struct to work on * * Returns: 0 on success * < 0 on error */ void bond_3ad_bind_slave(struct slave *slave) { struct bonding *bond = bond_get_bond_by_slave(slave); struct port *port; struct aggregator *aggregator; /* check that the slave has not been initialized yet. 
*/ if (SLAVE_AD_INFO(slave)->port.slave != slave) { /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); ad_initialize_port(port, bond->params.lacp_fast); port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and * user key */ port->actor_admin_port_key = bond->params.ad_user_port_key << 6; ad_update_actor_keys(port, false); /* actor system is the bond's system */ __ad_actor_update_port(port); /* tx timer(to verify that no more than MAX_TX_IN_SECOND * lacpdu's are sent in one second) */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; __disable_port(port); /* aggregator initialization */ aggregator = &(SLAVE_AD_INFO(slave)->aggregator); ad_initialize_agg(aggregator); aggregator->aggregator_mac_address = *((struct mac_addr *)bond->dev->dev_addr); aggregator->aggregator_identifier = ++BOND_AD_INFO(bond).aggregator_identifier; aggregator->slave = slave; aggregator->is_active = 0; aggregator->num_of_ports = 0; } } /** * bond_3ad_unbind_slave - deinitialize a slave's port * @slave: slave struct to work on * * Search for the aggregator that is related to this port, remove the * aggregator and assign another aggregator for other port related to it * (if any), and remove the port. */ void bond_3ad_unbind_slave(struct slave *slave) { struct port *port, *prev_port, *temp_port; struct aggregator *aggregator, *new_aggregator, *temp_aggregator; int select_new_active_agg = 0; struct bonding *bond = slave->bond; struct slave *slave_iter; struct list_head *iter; bool dummy_slave_update; /* Ignore this value as caller updates array */ /* Sync against bond_3ad_state_machine_handler() */ spin_lock_bh(&bond->mode_lock); aggregator = &(SLAVE_AD_INFO(slave)->aggregator); port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(bond->dev, slave->dev, "Trying to unbind an uninitialized port\n"); goto out; } slave_dbg(bond->dev, slave->dev, "Unbinding Link Aggregation Group %d\n", aggregator->aggregator_identifier); /* Tell the partner that this port is not suitable for aggregation */ port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state &= ~LACP_STATE_AGGREGATION; __update_lacpdu_from_port(port); ad_lacpdu_send(port); /* check if this aggregator is occupied */ if (aggregator->lag_ports) { /* check if there are other ports related to this aggregator * except the port related to this slave(thats ensure us that * there is a reason to search for new aggregator, and that we * will find one */ if ((aggregator->lag_ports != port) || (aggregator->lag_ports->next_port_in_aggregator)) { /* find new aggregator for the related port(s) */ bond_for_each_slave(bond, slave_iter, iter) { new_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); /* if the new aggregator is empty, or it is * connected to our port only */ if (!new_aggregator->lag_ports || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator)) break; } if (!slave_iter) new_aggregator = NULL; /* if new aggregator found, copy the aggregator's * parameters and connect the related lag_ports to the * new aggregator */ if ((new_aggregator) && ((!new_aggregator->lag_ports) || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator))) { slave_dbg(bond->dev, slave->dev, "Some port(s) 
related to LAG %d - replacing with LAG %d\n", aggregator->aggregator_identifier, new_aggregator->aggregator_identifier); if ((new_aggregator->lag_ports == port) && new_aggregator->is_active) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); select_new_active_agg = 1; } new_aggregator->is_individual = aggregator->is_individual; new_aggregator->actor_admin_aggregator_key = aggregator->actor_admin_aggregator_key; new_aggregator->actor_oper_aggregator_key = aggregator->actor_oper_aggregator_key; new_aggregator->partner_system = aggregator->partner_system; new_aggregator->partner_system_priority = aggregator->partner_system_priority; new_aggregator->partner_oper_aggregator_key = aggregator->partner_oper_aggregator_key; new_aggregator->receive_state = aggregator->receive_state; new_aggregator->transmit_state = aggregator->transmit_state; new_aggregator->lag_ports = aggregator->lag_ports; new_aggregator->is_active = aggregator->is_active; new_aggregator->num_of_ports = aggregator->num_of_ports; /* update the information that is written on * the ports about the aggregator */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { temp_port->aggregator = new_aggregator; temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } ad_clear_agg(aggregator); if (select_new_active_agg) ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } else { slave_warn(bond->dev, slave->dev, "unbinding aggregator, and could not find a new aggregator for its ports\n"); } } else { /* in case that the only port related to this * aggregator is the one we want to remove */ select_new_active_agg = aggregator->is_active; ad_clear_agg(aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ temp_aggregator = __get_first_agg(port); if (temp_aggregator) ad_agg_selection_logic(temp_aggregator, &dummy_slave_update); } } } slave_dbg(bond->dev, slave->dev, "Unbinding port %d\n", port->actor_port_number); /* find the aggregator that this port is connected to */ bond_for_each_slave(bond, slave_iter, iter) { temp_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); prev_port = NULL; /* search the port in the aggregator's related ports */ for (temp_port = temp_aggregator->lag_ports; temp_port; prev_port = temp_port, temp_port = temp_port->next_port_in_aggregator) { if (temp_port == port) { /* the aggregator found - detach the port from * this aggregator */ if (prev_port) prev_port->next_port_in_aggregator = temp_port->next_port_in_aggregator; else temp_aggregator->lag_ports = temp_port->next_port_in_aggregator; temp_aggregator->num_of_ports--; if (__agg_active_ports(temp_aggregator) == 0) { select_new_active_agg = temp_aggregator->is_active; if (temp_aggregator->num_of_ports == 0) ad_clear_agg(temp_aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } } break; } } } port->slave = NULL; out: spin_unlock_bh(&bond->mode_lock); } /** * bond_3ad_update_ad_actor_settings - reflect change of actor settings to ports * @bond: bonding struct to work on * * If an ad_actor setting gets changed we need to update the individual port * settings so the bond device will use the new values when it gets upped. 
*/ void bond_3ad_update_ad_actor_settings(struct bonding *bond) { struct list_head *iter; struct slave *slave; ASSERT_RTNL(); BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { struct port *port = &(SLAVE_AD_INFO(slave))->port; __ad_actor_update_port(port); port->ntt = true; } spin_unlock_bh(&bond->mode_lock); } /** * bond_agg_timer_advance - advance agg_select_timer * @bond: bonding structure * * Return true when agg_select_timer reaches 0. */ static bool bond_agg_timer_advance(struct bonding *bond) { int val, nval; while (1) { val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); if (!val) return false; nval = val - 1; if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, val, nval) == val) break; } return nval == 0; } /** * bond_3ad_state_machine_handler - handle state machines timeout * @work: work context to fetch bonding struct to work on from * * The state machine handling concept in this module is to check every tick * which state machine should operate any function. The execution order is * round robin, so when we have an interaction between state machines, the * reply of one to each other might be delayed until next tick. * * This function also complete the initialization when the agg_select_timer * times out, and it selects an aggregator for the ports that are yet not * related to any aggregator, and selects the active aggregator for a bond. */ void bond_3ad_state_machine_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, ad_work.work); struct aggregator *aggregator; struct list_head *iter; struct slave *slave; struct port *port; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; bool update_slave_arr = false; /* Lock to protect data accessed by all (e.g., port->sm_vars) and * against running with bond_3ad_unbind_slave. ad_rx_machine may run * concurrently due to incoming LACPDU as well. */ spin_lock_bh(&bond->mode_lock); rcu_read_lock(); /* check if there are any slaves */ if (!bond_has_slaves(bond)) goto re_arm; if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? 
&(SLAVE_AD_INFO(slave)->port) : NULL; /* select the active aggregator for the bond */ if (port) { if (!port->slave) { net_warn_ratelimited("%s: Warning: bond's first port is uninitialized\n", bond->dev->name); goto re_arm; } aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, &update_slave_arr); } bond_3ad_set_carrier(bond); } /* for each port run the state machines */ bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: Found an uninitialized port\n", bond->dev->name); goto re_arm; } ad_rx_machine(NULL, port); ad_periodic_machine(port, &bond->params); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); ad_churn_machine(port); /* turn off the BEGIN bit, since we already handled it */ if (port->sm_vars & AD_PORT_BEGIN) port->sm_vars &= ~AD_PORT_BEGIN; } re_arm: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } rcu_read_unlock(); spin_unlock_bh(&bond->mode_lock); if (update_slave_arr) bond_slave_arr_work_rearm(bond, 0); if (should_notify_rtnl && rtnl_trylock()) { bond_slave_state_notify(bond); rtnl_unlock(); } queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks); } /** * bond_3ad_rx_indication - handle a received frame * @lacpdu: received lacpdu * @slave: slave struct to work on * * It is assumed that frames that were sent on this NIC don't returned as new * received frames (loopback). Since only the payload is given to this * function, it check for loopback. */ static int bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave) { struct bonding *bond = slave->bond; int ret = RX_HANDLER_ANOTHER; struct bond_marker *marker; struct port *port; atomic64_t *stat; port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: port of slave %s is uninitialized\n", slave->dev->name, slave->bond->dev->name); return ret; } switch (lacpdu->subtype) { case AD_TYPE_LACPDU: ret = RX_HANDLER_CONSUMED; slave_dbg(slave->bond->dev, slave->dev, "Received LACPDU on port %d\n", port->actor_port_number); /* Protect against concurrent state machines */ spin_lock(&slave->bond->mode_lock); ad_rx_machine(lacpdu, port); spin_unlock(&slave->bond->mode_lock); break; case AD_TYPE_MARKER: ret = RX_HANDLER_CONSUMED; /* No need to convert fields to Little Endian since we * don't use the marker's fields. */ marker = (struct bond_marker *)lacpdu; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Information on port %d\n", port->actor_port_number); ad_marker_info_received(marker, port); break; case AD_MARKER_RESPONSE_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Response on port %d\n", port->actor_port_number); ad_marker_response_received(marker, port); break; default: slave_dbg(slave->bond->dev, slave->dev, "Received an unknown Marker subtype on port %d\n", port->actor_port_number); stat = &SLAVE_AD_INFO(slave)->stats.marker_unknown_rx; atomic64_inc(stat); stat = &BOND_AD_INFO(bond).stats.marker_unknown_rx; atomic64_inc(stat); } break; default: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_unknown_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_unknown_rx); } return ret; } /** * ad_update_actor_keys - Update the oper / admin keys for a port based on * its current speed and duplex settings. 
* * @port: the port we'are looking at * @reset: Boolean to just reset the speed and the duplex part of the key * * The logic to change the oper / admin keys is: * (a) A full duplex port can participate in LACP with partner. * (b) When the speed is changed, LACP need to be reinitiated. */ static void ad_update_actor_keys(struct port *port, bool reset) { u8 duplex = 0; u16 ospeed = 0, speed = 0; u16 old_oper_key = port->actor_oper_port_key; port->actor_admin_port_key &= ~(AD_SPEED_KEY_MASKS|AD_DUPLEX_KEY_MASKS); if (!reset) { speed = __get_link_speed(port); ospeed = (old_oper_key & AD_SPEED_KEY_MASKS) >> 1; duplex = __get_duplex(port); port->actor_admin_port_key |= (speed << 1) | duplex; } port->actor_oper_port_key = port->actor_admin_port_key; if (old_oper_key != port->actor_oper_port_key) { /* Only 'duplex' port participates in LACP */ if (duplex) port->sm_vars |= AD_PORT_LACP_ENABLED; else port->sm_vars &= ~AD_PORT_LACP_ENABLED; if (!reset) { if (!speed) { slave_err(port->slave->bond->dev, port->slave->dev, "speed changed to 0 on port %d\n", port->actor_port_number); } else if (duplex && ospeed != speed) { /* Speed change restarts LACP state-machine */ port->sm_vars |= AD_PORT_BEGIN; } } } } /** * bond_3ad_adapter_speed_duplex_changed - handle a slave's speed / duplex * change indication * * @slave: slave struct to work on * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_adapter_speed_duplex_changed(struct slave *slave) { struct port *port; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "speed/duplex changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); ad_update_actor_keys(port, false); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed speed/duplex\n", port->actor_port_number); } /** * bond_3ad_handle_link_change - handle a slave's link status change indication * @slave: slave struct to work on * @link: whether the link is now up or down * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_handle_link_change(struct slave *slave, char link) { struct aggregator *agg; struct port *port; bool dummy; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "link status changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); /* on link down we are zeroing duplex and speed since * some of the adaptors(ce1000.lan) report full duplex/speed * instead of N/A(duplex) / 0(speed). * * on link up we are forcing recheck on the duplex and speed since * some of he adaptors(ce1000.lan) report. */ if (link == BOND_LINK_UP) { port->is_enabled = true; ad_update_actor_keys(port, false); } else { /* link has failed */ port->is_enabled = false; ad_update_actor_keys(port, true); } agg = __get_first_agg(port); ad_agg_selection_logic(agg, &dummy); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed link status to %s\n", port->actor_port_number, link == BOND_LINK_UP ? "UP" : "DOWN"); /* RTNL is held and mode_lock is released so it's safe * to update slave_array here. */ bond_update_slave_arr(slave->bond, NULL); } /** * bond_3ad_set_carrier - set link state for bonding master * @bond: bonding structure * * if we have an active aggregator, we're up, if not, we're down. 
* Presumes that we cannot have an active aggregator if there are * no slaves with link up. * * This behavior complies with IEEE 802.3 section 43.3.9. * * Called by bond_set_carrier(). Return zero if carrier state does not * change, nonzero if it does. */ int bond_3ad_set_carrier(struct bonding *bond) { struct aggregator *active; struct slave *first_slave; int ret = 1; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); if (!first_slave) { ret = 0; goto out; } active = __get_active_agg(&(SLAVE_AD_INFO(first_slave)->aggregator)); if (active) { /* are enough slaves available to consider link up? */ if (__agg_active_ports(active) < bond->params.min_links) { if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); goto out; } } else if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); goto out; } } else if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); } out: rcu_read_unlock(); return ret; } /** * __bond_3ad_get_active_agg_info - get information of the active aggregator * @bond: bonding struct to work on * @ad_info: ad_info struct to fill with the bond's info * * Returns: 0 on success * < 0 on error */ int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { struct aggregator *aggregator = NULL; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (port->aggregator && port->aggregator->is_active) { aggregator = port->aggregator; break; } } if (!aggregator) return -1; ad_info->aggregator_id = aggregator->aggregator_identifier; ad_info->ports = __agg_active_ports(aggregator); ad_info->actor_key = aggregator->actor_oper_aggregator_key; ad_info->partner_key = aggregator->partner_oper_aggregator_key; ether_addr_copy(ad_info->partner_system, aggregator->partner_system.mac_addr_value); return 0; } int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { int ret; rcu_read_lock(); ret = __bond_3ad_get_active_agg_info(bond, ad_info); rcu_read_unlock(); return ret; } int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct lacpdu *lacpdu, _lacpdu; if (skb->protocol != PKT_TYPE_LACPDU) return RX_HANDLER_ANOTHER; if (!MAC_ADDRESS_EQUAL(eth_hdr(skb)->h_dest, lacpdu_mcast_addr)) return RX_HANDLER_ANOTHER; lacpdu = skb_header_pointer(skb, 0, sizeof(_lacpdu), &_lacpdu); if (!lacpdu) { atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_illegal_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_illegal_rx); return RX_HANDLER_ANOTHER; } return bond_3ad_rx_indication(lacpdu, slave); } /** * bond_3ad_update_lacp_rate - change the lacp rate * @bond: bonding struct * * When modify lacp_rate parameter via sysfs, * update actor_oper_port_state of each port. * * Hold bond->mode_lock, * so we can modify port->actor_oper_port_state, * no matter bond is up or down. 
*/ void bond_3ad_update_lacp_rate(struct bonding *bond) { struct port *port = NULL; struct list_head *iter; struct slave *slave; int lacp_fast; lacp_fast = bond->params.lacp_fast; spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; else port->actor_oper_port_state &= ~LACP_STATE_LACP_TIMEOUT; } spin_unlock_bh(&bond->mode_lock); } size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_UNKNOWN_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_ILLEGAL_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_TX */ nla_total_size_64bit(sizeof(u64)); /* BOND_3AD_STAT_MARKER_UNKNOWN_RX */ } int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats) { u64 val; val = atomic64_read(&stats->lacpdu_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_illegal_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_ILLEGAL_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; return 0; }
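/*
 * Editor's illustration (not part of bond_3ad.c): a minimal userspace sketch
 * of the decision ladder documented above ad_agg_selection_test(). The
 * struct, function names and the sample numbers below are hypothetical;
 * only the ordering of the rules mirrors the driver comment: prefer a
 * non-individual aggregator, then one whose partner answered, then fall
 * back to the ad_select policy (port count, else bandwidth).
 */
#include <stdbool.h>
#include <stdio.h>

enum sel_mode { SEL_STABLE, SEL_BANDWIDTH, SEL_COUNT };

struct agg_view {			/* summary of one aggregator */
	const char *name;
	bool is_individual;		/* cannot really aggregate */
	bool has_partner;		/* partner replied with LACPDUs */
	int  active_ports;		/* ports able to collect/distribute */
	unsigned long bandwidth;	/* aggregate speed, arbitrary units */
};

static const struct agg_view *pick_better(const struct agg_view *best,
					  const struct agg_view *curr,
					  enum sel_mode mode)
{
	if (!best)
		return curr;
	if (!curr->is_individual && best->is_individual)
		return curr;
	if (curr->is_individual && !best->is_individual)
		return best;
	if (curr->has_partner && !best->has_partner)
		return curr;
	if (!curr->has_partner && best->has_partner)
		return best;
	if (mode == SEL_COUNT && curr->active_ports != best->active_ports)
		return curr->active_ports > best->active_ports ? curr : best;
	/* equal counts, or a bandwidth-based policy: compare bandwidth */
	return curr->bandwidth > best->bandwidth ? curr : best;
}

int main(void)
{
	struct agg_view a = { "agg1", false, true, 2, 2000 };
	struct agg_view b = { "agg2", false, true, 3, 1000 };

	printf("BANDWIDTH picks %s\n", pick_better(&a, &b, SEL_BANDWIDTH)->name);
	printf("COUNT picks %s\n", pick_better(&a, &b, SEL_COUNT)->name);
	return 0;
}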
// SPDX-License-Identifier: GPL-2.0-or-later /* * "TEE" target extension for Xtables * Copyright © Sebastian Claßen, 2007 * Jan Engelhardt, 2007-2010 * * based on ipt_ROUTE.c from Cédric de Launois * <delaunois@info.ucl.be> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/route.h> #include <linux/netfilter/x_tables.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #include <net/netfilter/ipv6/nf_dup_ipv6.h> #include <linux/netfilter/xt_TEE.h> struct xt_tee_priv { struct list_head list; struct xt_tee_tginfo *tginfo; int oif; }; static unsigned int tee_net_id __read_mostly; static const union nf_inet_addr tee_zero_address; struct tee_net { struct list_head priv_list; /* lock protects the priv_list */ struct mutex lock; }; static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ? info->priv->oif : 0; nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif); return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; int oif = info->priv ?
info->priv->oif : 0; nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif); return XT_CONTINUE; } #endif static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct tee_net *tn = net_generic(net, tee_net_id); struct xt_tee_priv *priv; mutex_lock(&tn->lock); list_for_each_entry(priv, &tn->priv_list, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; break; case NETDEV_UNREGISTER: if (dev->ifindex == priv->oif) priv->oif = -1; break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, priv->tginfo->oif)) priv->oif = dev->ifindex; else if (dev->ifindex == priv->oif) priv->oif = -1; break; } } mutex_unlock(&tn->lock); return NOTIFY_DONE; } static int tee_tg_check(const struct xt_tgchk_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; struct xt_tee_priv *priv; /* 0.0.0.0 and :: not allowed */ if (memcmp(&info->gw, &tee_zero_address, sizeof(tee_zero_address)) == 0) return -EINVAL; if (info->oif[0]) { struct net_device *dev; if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; priv->tginfo = info; priv->oif = -1; info->priv = priv; dev = dev_get_by_name(par->net, info->oif); if (dev) { priv->oif = dev->ifindex; dev_put(dev); } mutex_lock(&tn->lock); list_add(&priv->list, &tn->priv_list); mutex_unlock(&tn->lock); } else info->priv = NULL; static_key_slow_inc(&xt_tee_enabled); return 0; } static void tee_tg_destroy(const struct xt_tgdtor_param *par) { struct tee_net *tn = net_generic(par->net, tee_net_id); struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { mutex_lock(&tn->lock); list_del(&info->priv->list); mutex_unlock(&tn->lock); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { { .name = "TEE", .revision = 1, .family = NFPROTO_IPV4, .target = tee_tg4, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TEE", .revision = 1, .family = NFPROTO_IPV6, .target = tee_tg6, .targetsize = sizeof(struct xt_tee_tginfo), .usersize = offsetof(struct xt_tee_tginfo, priv), .checkentry = tee_tg_check, .destroy = tee_tg_destroy, .me = THIS_MODULE, }, #endif }; static int __net_init tee_net_init(struct net *net) { struct tee_net *tn = net_generic(net, tee_net_id); INIT_LIST_HEAD(&tn->priv_list); mutex_init(&tn->lock); return 0; } static struct pernet_operations tee_net_ops = { .init = tee_net_init, .id = &tee_net_id, .size = sizeof(struct tee_net), }; static struct notifier_block tee_netdev_notifier = { .notifier_call = tee_netdev_event, }; static int __init tee_tg_init(void) { int ret; ret = register_pernet_subsys(&tee_net_ops); if (ret < 0) return ret; ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&tee_netdev_notifier); if (ret < 0) goto unregister_targets; return 0; unregister_targets: xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); cleanup_subsys: unregister_pernet_subsys(&tee_net_ops); return ret; } static void __exit tee_tg_exit(void) { unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, 
ARRAY_SIZE(tee_tg_reg)); unregister_pernet_subsys(&tee_net_ops); } module_init(tee_tg_init); module_exit(tee_tg_exit); MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_DESCRIPTION("Xtables: Reroute packet copy"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_TEE"); MODULE_ALIAS("ip6t_TEE");
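/*
 * Editor's illustration (not part of xt_TEE.c): a small userspace model of
 * how the TEE target keeps its cached output-interface index in sync with
 * the configured interface name. The event names, struct and sample values
 * here are made up for the example; only the update rules follow
 * tee_netdev_event() above: a register or a rename to the configured name
 * caches the ifindex, while an unregister or a rename away from it
 * invalidates the cache (oif = -1).
 */
#include <stdio.h>
#include <string.h>

enum ev { EV_REGISTER, EV_UNREGISTER, EV_CHANGENAME };

struct tee_entry {
	char configured[16];	/* interface name given by the rule */
	int  oif;		/* cached ifindex, -1 when unknown  */
};

static void netdev_event(struct tee_entry *e, enum ev event,
			 const char *dev_name, int dev_ifindex)
{
	switch (event) {
	case EV_REGISTER:
		if (!strcmp(dev_name, e->configured))
			e->oif = dev_ifindex;
		break;
	case EV_UNREGISTER:
		if (dev_ifindex == e->oif)
			e->oif = -1;
		break;
	case EV_CHANGENAME:
		if (!strcmp(dev_name, e->configured))
			e->oif = dev_ifindex;
		else if (dev_ifindex == e->oif)
			e->oif = -1;
		break;
	}
}

int main(void)
{
	struct tee_entry e = { "dup0", -1 };

	netdev_event(&e, EV_REGISTER, "dup0", 7);
	printf("after register: oif=%d\n", e.oif);	/* 7  */
	netdev_event(&e, EV_CHANGENAME, "eth9", 7);	/* renamed away */
	printf("after rename:   oif=%d\n", e.oif);	/* -1 */
	return 0;
}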
// SPDX-License-Identifier: GPL-2.0-only #include <linux/stat.h> #include <linux/sysctl.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/hash.h> #include <linux/kmemleak.h> #include <linux/user_namespace.h> struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, .count = ATOMIC_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, uid) \ hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ UCOUNTS_HASHTABLE_BITS) #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) { return &current_user_ns()->set; } static int set_is_seen(struct ctl_table_set *set) { return &current_user_ns()->set == set; } static int set_permissions(struct ctl_table_header *head, struct ctl_table *table) { struct user_namespace *user_ns = container_of(head->set, struct user_namespace, set); int mode; /* Allow users with CAP_SYS_RESOURCE unrestrained access */ if (ns_capable(user_ns, CAP_SYS_RESOURCE)) mode = (table->mode & S_IRWXU) >> 6; else /* Allow all others at most read-only access */ mode = table->mode & S_IROTH; return (mode << 6) | (mode << 3) | mode; } static struct ctl_table_root set_root = { .lookup = set_lookup, .permissions = set_permissions, }; static long ue_zero = 0; static long ue_int_max = INT_MAX; #define UCOUNT_ENTRY(name) \ { \ .procname = name, \ .maxlen = sizeof(long), \ .mode = 0644, \ .proc_handler = proc_doulongvec_minmax, \ .extra1 = &ue_zero, \ .extra2 = &ue_int_max, \ } static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_user_namespaces"), UCOUNT_ENTRY("max_pid_namespaces"), UCOUNT_ENTRY("max_uts_namespaces"), UCOUNT_ENTRY("max_ipc_namespaces"), UCOUNT_ENTRY("max_net_namespaces"), UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"),
UCOUNT_ENTRY("max_time_namespaces"), #ifdef CONFIG_INOTIFY_USER UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif #ifdef CONFIG_FANOTIFY UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif { } }; #endif /* CONFIG_SYSCTL */ bool setup_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1); setup_sysctl_set(&ns->set, &set_root, set_is_seen); tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < UCOUNT_COUNTS; i++) { tbl[i].data = &ns->ucount_max[i]; } ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl, ARRAY_SIZE(user_table)); } if (!ns->sysctls) { kfree(tbl); retire_sysctl_set(&ns->set); return false; } #endif return true; } void retire_userns_sysctls(struct user_namespace *ns) { #ifdef CONFIG_SYSCTL struct ctl_table *tbl; tbl = ns->sysctls->ctl_table_arg; unregister_sysctl_table(ns->sysctls); retire_sysctl_set(&ns->set); kfree(tbl); #endif } static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) { struct ucounts *ucounts; hlist_for_each_entry(ucounts, hashent, node) { if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) return ucounts; } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); spin_lock_irq(&ucounts_lock); hlist_add_head(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) { /* Returns true on a successful get, false if the count wraps. */ return !atomic_add_negative(1, &ucounts->count); } struct ucounts *get_ucounts(struct ucounts *ucounts) { if (!get_ucounts_or_wrap(ucounts)) { put_ucounts(ucounts); ucounts = NULL; } return ucounts; } struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; bool wrapped; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (!ucounts) { spin_unlock_irq(&ucounts_lock); new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; new->ns = ns; new->uid = uid; atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); if (ucounts) { kfree(new); } else { hlist_add_head(&new->node, hashent); get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } } wrapped = !get_ucounts_or_wrap(ucounts); spin_unlock_irq(&ucounts_lock); if (wrapped) { put_ucounts(ucounts); return NULL; } return ucounts; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { hlist_del_init(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); put_user_ns(ucounts->ns); kfree(ucounts); } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { long c, old; c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; } } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type) { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; 
fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) atomic_long_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; } void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(v, &iter->rlimit[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; max = get_userns_rlimit_max(iter->ns, type); } return ret; } bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long new = -1; /* Silence compiler warning */ for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_sub_return(v, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); if (iter == ucounts) new = dec; } return (new == 0); } static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, struct ucounts *last, enum rlimit_type type) { struct ucounts *iter, *next; for (iter = ucounts; iter != last; iter = next) { long dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); next = iter->ns->ucounts; if (dec == 0) put_ucounts(iter); } } void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) { do_dec_rlimit_put_ucounts(ucounts, NULL, type); } long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; long max = LONG_MAX; long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) goto unwind; if (iter == ucounts) ret = new; max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. */ if (new != 1) continue; if (!get_ucounts(iter)) goto dec_unwind; } return ret; dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; long max = rlimit; if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { long val = get_rlimit_value(iter, type); if (val < 0 || val > max) return true; max = get_userns_rlimit_max(iter->ns, type); } return false; } static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL static struct ctl_table_header *user_header; static struct ctl_table empty[1]; /* * It is necessary to register the user directory in the * default set so that registrations in the child sets work * properly. */ user_header = register_sysctl_sz("user", empty, 0); kmemleak_ignore(user_header); BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init);
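/*
 * Editor's illustration (not part of ucount.c): a userspace sketch of the
 * "increment only while below a limit" pattern used by
 * atomic_long_inc_below() above, written with C11 <stdatomic.h> instead of
 * the kernel's atomic_long_cmpxchg(). The limit value in main() is
 * arbitrary; the point is the retry loop that re-reads the counter whenever
 * the compare-and-swap loses a race.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool inc_below(atomic_long *v, long limit)
{
	long c = atomic_load(v);

	for (;;) {
		if (c >= limit)
			return false;	/* already at the cap, refuse */
		/* on failure, c is refreshed with the current value */
		if (atomic_compare_exchange_weak(v, &c, c + 1))
			return true;
	}
}

int main(void)
{
	atomic_long count = 0;
	long limit = 3;

	for (int i = 0; i < 5; i++)
		printf("attempt %d: %s (count=%ld)\n", i,
		       inc_below(&count, limit) ? "ok" : "over limit",
		       atomic_load(&count));
	return 0;
}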
// SPDX-License-Identifier: GPL-2.0-only /* * This file contains vfs inode ops for the 9P2000.L protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" #include "xattr.h" #include "acl.h" static int v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object * @dir_inode: The directory inode * * Helper function to get the gid for creating a * new file system object. This checks the S_ISGID to determine the owning * group of the new file system object. */ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode) { BUG_ON(dir_inode == NULL); if (dir_inode->i_mode & S_ISGID) { /* set_gid bit is set.*/ return dir_inode->i_gid; } return current_fsgid(); } static int v9fs_test_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; /* don't match inode of different type */ if (inode_wrong_type(inode, st->st_mode)) return 0; if (inode->i_generation != st->st_gen) return 0; /* compare qid details */ if (memcmp(&v9inode->qid.version, &st->qid.version, sizeof(v9inode->qid.version))) return 0; if (v9inode->qid.type != st->qid.type) return 0; if (v9inode->qid.path != st->qid.path) return 0; return 1; } /* Always get a new inode */ static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) { return 0; } static int v9fs_set_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); inode->i_generation = st->st_gen; return 0; } static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct p9_qid *qid, struct p9_fid *fid, struct p9_stat_dotl *st, int new) { int retval; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; int (*test)(struct inode *inode, void *data); if (new) test = v9fs_test_new_inode_dotl; else test = v9fs_test_inode_dotl; i_ino = v9fs_qid2ino(qid); inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; /* * initialize the inode with the stat info * FIXME!! we may need support for stale inodes * later.
*/ inode->i_ino = i_ino; retval = v9fs_init_inode(v9ses, inode, st->st_mode, new_decode_dev(st->st_rdev)); if (retval) goto error; v9fs_stat2inode_dotl(st, inode, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) goto error; unlock_new_inode(inode); return inode; error: iget_failed(inode); return ERR_PTR(retval); } struct inode * v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new) { struct p9_stat_dotl *st; struct inode *inode = NULL; st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); if (IS_ERR(st)) return ERR_CAST(st); inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); kfree(st); return inode; } struct dotl_openflag_map { int open_flag; int dotl_flag; }; static int v9fs_mapped_dotl_flags(int flags) { int i; int rflags = 0; struct dotl_openflag_map dotl_oflag_map[] = { { O_CREAT, P9_DOTL_CREATE }, { O_EXCL, P9_DOTL_EXCL }, { O_NOCTTY, P9_DOTL_NOCTTY }, { O_APPEND, P9_DOTL_APPEND }, { O_NONBLOCK, P9_DOTL_NONBLOCK }, { O_DSYNC, P9_DOTL_DSYNC }, { FASYNC, P9_DOTL_FASYNC }, { O_DIRECT, P9_DOTL_DIRECT }, { O_LARGEFILE, P9_DOTL_LARGEFILE }, { O_DIRECTORY, P9_DOTL_DIRECTORY }, { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, { O_NOATIME, P9_DOTL_NOATIME }, { O_CLOEXEC, P9_DOTL_CLOEXEC }, { O_SYNC, P9_DOTL_SYNC}, }; for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { if (flags & dotl_oflag_map[i].open_flag) rflags |= dotl_oflag_map[i].dotl_flag; } return rflags; } /** * v9fs_open_to_dotl_flags- convert Linux specific open flags to * plan 9 open flag. * @flags: flags to convert */ int v9fs_open_to_dotl_flags(int flags) { int rflags = 0; /* * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY * and P9_DOTL_NOACCESS */ rflags |= flags & O_ACCMODE; rflags |= v9fs_mapped_dotl_flags(flags); return rflags; } /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. 
* @idmap: The user namespace of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions * @excl: True if the file must not yet exist * */ static int v9fs_vfs_create_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) { return v9fs_vfs_mknod_dotl(idmap, dir, dentry, omode, 0); } static int v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int flags, umode_t omode) { int err = 0; kgid_t gid; umode_t mode; int p9_omode = v9fs_open_to_dotl_flags(flags); const unsigned char *name = NULL; struct p9_qid qid; struct inode *inode; struct p9_fid *fid = NULL; struct p9_fid *dfid = NULL, *ofid = NULL; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; struct dentry *res = NULL; if (d_in_lookup(dentry)) { res = v9fs_vfs_lookup(dir, dentry, 0); if (IS_ERR(res)) return PTR_ERR(res); if (res) dentry = res; } /* Only creates */ if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); v9ses = v9fs_inode2v9ses(dir); name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%x\n", name, flags, omode); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); goto out; } /* clone a fid to use for creation */ ofid = clone_fid(dfid); if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in create %d\n", err); goto out; } if ((v9ses->cache & CACHE_WRITEBACK) && (p9_omode & P9_OWRITE)) { p9_omode = (p9_omode & ~P9_OWRITE) | P9_ORDWR; p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, creating w/ O_RDWR\n"); } err = p9_client_create_dotl(ofid, name, p9_omode, mode, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in create %d\n", err); goto out; } v9fs_invalidate_inode_attr(dir); /* instantiate inode and assign the unopened fid to the dentry */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto out; } inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto out; } /* Now set the ACL based on the default value */ v9fs_set_create_acl(inode, fid, dacl, pacl); v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); /* Since we are opening a file, assign the open fid to the file */ err = finish_open(file, dentry, generic_file_open); if (err) goto out; file->private_data = ofid; #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) { struct v9fs_inode *v9inode = V9FS_I(inode); fscache_use_cookie(v9fs_inode_cookie(v9inode), file->f_mode & FMODE_WRITE); } #endif v9fs_fid_add_modes(ofid, v9ses->flags, v9ses->cache, flags); v9fs_open_fid_add(inode, &ofid); file->f_mode |= FMODE_CREATED; out: p9_fid_put(dfid); p9_fid_put(ofid); p9_fid_put(fid); v9fs_put_acl(dacl, pacl); dput(res); return err; } /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory * @idmap: The idmap of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory * */ static int 
v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode) { int err; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; umode_t mode; struct inode *inode; struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) omode |= S_ISGID; dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); goto error; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n", err); goto error; } name = dentry->d_name.name; err = p9_client_mkdir_dotl(dfid, name, mode, gid, &qid); if (err < 0) goto error; fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto error; } /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_fid_add(dentry, &fid); v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); err = 0; } else { /* * Not in cached mode. No need to populate * inode with stat. We need to get an inode * so that we can set the acl with dentry */ inode = v9fs_get_inode(dir->i_sb, mode, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); } inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: p9_fid_put(fid); v9fs_put_acl(dacl, pacl); p9_fid_put(dfid); return err; } static int v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct inode *inode = d_inode(dentry); struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache) { if (S_ISREG(inode->i_mode)) { int retval = filemap_fdatawrite(inode->i_mapping); if (retval) p9_debug(P9_DEBUG_ERROR, "flushing writeback during getattr returned %d\n", retval); } } fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); /* Ask for all the fields in stat structure. Server will return * whatever it supports */ st = p9_client_getattr_dotl(fid, P9_STATS_ALL); p9_fid_put(fid); if (IS_ERR(st)) return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; kfree(st); return 0; } /* * Attribute flags. 
*/ #define P9_ATTR_MODE (1 << 0) #define P9_ATTR_UID (1 << 1) #define P9_ATTR_GID (1 << 2) #define P9_ATTR_SIZE (1 << 3) #define P9_ATTR_ATIME (1 << 4) #define P9_ATTR_MTIME (1 << 5) #define P9_ATTR_CTIME (1 << 6) #define P9_ATTR_ATIME_SET (1 << 7) #define P9_ATTR_MTIME_SET (1 << 8) struct dotl_iattr_map { int iattr_valid; int p9_iattr_valid; }; static int v9fs_mapped_iattr_valid(int iattr_valid) { int i; int p9_iattr_valid = 0; struct dotl_iattr_map dotl_iattr_map[] = { { ATTR_MODE, P9_ATTR_MODE }, { ATTR_UID, P9_ATTR_UID }, { ATTR_GID, P9_ATTR_GID }, { ATTR_SIZE, P9_ATTR_SIZE }, { ATTR_ATIME, P9_ATTR_ATIME }, { ATTR_MTIME, P9_ATTR_MTIME }, { ATTR_CTIME, P9_ATTR_CTIME }, { ATTR_ATIME_SET, P9_ATTR_ATIME_SET }, { ATTR_MTIME_SET, P9_ATTR_MTIME_SET }, }; for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) { if (iattr_valid & dotl_iattr_map[i].iattr_valid) p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid; } return p9_iattr_valid; } /** * v9fs_vfs_setattr_dotl - set file metadata * @idmap: idmap of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * */ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; struct inode *inode = d_inode(dentry); struct v9fs_session_info __maybe_unused *v9ses; struct p9_fid *fid = NULL; struct p9_iattr_dotl p9attr = { .uid = INVALID_UID, .gid = INVALID_GID, }; p9_debug(P9_DEBUG_VFS, "\n"); retval = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (retval) return retval; v9ses = v9fs_dentry2v9ses(dentry); p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); if (iattr->ia_valid & ATTR_MODE) p9attr.mode = iattr->ia_mode; if (iattr->ia_valid & ATTR_UID) p9attr.uid = iattr->ia_uid; if (iattr->ia_valid & ATTR_GID) p9attr.gid = iattr->ia_gid; if (iattr->ia_valid & ATTR_SIZE) p9attr.size = iattr->ia_size; if (iattr->ia_valid & ATTR_ATIME_SET) { p9attr.atime_sec = iattr->ia_atime.tv_sec; p9attr.atime_nsec = iattr->ia_atime.tv_nsec; } if (iattr->ia_valid & ATTR_MTIME_SET) { p9attr.mtime_sec = iattr->ia_mtime.tv_sec; p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; } if (iattr->ia_valid & ATTR_FILE) { fid = iattr->ia_file->private_data; WARN_ON(!fid); } if (!fid) { fid = v9fs_fid_lookup(dentry); use_dentry = 1; } if (IS_ERR(fid)) return PTR_ERR(fid); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) { retval = filemap_fdatawrite(inode->i_mapping); if (retval < 0) p9_debug(P9_DEBUG_ERROR, "Flushing file prior to setattr failed: %d\n", retval); } retval = p9_client_setattr(fid, &p9attr); if (retval < 0) { if (use_dentry) p9_fid_put(fid); return retval; } if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size != i_size_read(inode)) { truncate_setsize(inode, iattr->ia_size); netfs_resize_file(netfs_inode(inode), iattr->ia_size, true); #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) fscache_resize_cookie(v9fs_inode_cookie(V9FS_I(inode)), iattr->ia_size); #endif } v9fs_invalidate_inode_attr(inode); setattr_copy(&nop_mnt_idmap, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ retval = v9fs_acl_chmod(inode, fid); if (retval < 0) { if (use_dentry) p9_fid_put(fid); return retval; } } if (use_dentry) p9_fid_put(fid); return 0; } /** * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate * @flags: ctrl flags (e.g. 
V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { inode_set_atime(inode, stat->st_atime_sec, stat->st_atime_nsec); inode_set_mtime(inode, stat->st_mtime_sec, stat->st_mtime_nsec); inode_set_ctime(inode, stat->st_ctime_sec, stat->st_ctime_nsec); inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; set_nlink(inode, stat->st_nlink); mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; v9inode->netfs.remote_i_size = stat->st_size; if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { inode_set_atime(inode, stat->st_atime_sec, stat->st_atime_nsec); } if (stat->st_result_mask & P9_STATS_MTIME) { inode_set_mtime(inode, stat->st_mtime_sec, stat->st_mtime_nsec); } if (stat->st_result_mask & P9_STATS_CTIME) { inode_set_ctime(inode, stat->st_ctime_sec, stat->st_ctime_nsec); } if (stat->st_result_mask & P9_STATS_UID) inode->i_uid = stat->st_uid; if (stat->st_result_mask & P9_STATS_GID) inode->i_gid = stat->st_gid; if (stat->st_result_mask & P9_STATS_NLINK) set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { mode = stat->st_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) { v9inode->netfs.remote_i_size = stat->st_size; v9fs_i_size_write(inode, stat->st_size); } if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } if (stat->st_result_mask & P9_STATS_GEN) inode->i_generation = stat->st_gen; /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION * because the inode structure does not have fields for them. */ v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } static int v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err; kgid_t gid; const unsigned char *name; struct p9_qid qid; struct inode *inode; struct p9_fid *dfid; struct p9_fid *fid = NULL; struct v9fs_session_info *v9ses; name = dentry->d_name.name; p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); return err; } gid = v9fs_get_fsgid_for_create(dir); /* Server doesn't alter fid on TSYMLINK. Hence no need to clone it. */ err = p9_client_symlink(dfid, name, symname, gid, &qid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); goto error; } v9fs_invalidate_inode_attr(dir); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto error; } /* instantiate inode and assign the unopened fid to dentry */ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); err = 0; } else { /* Not in cached mode. 
No need to populate inode with stat */ inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; } d_instantiate(dentry, inode); } error: p9_fid_put(fid); p9_fid_put(dfid); return err; } /** * v9fs_vfs_link_dotl - create a hardlink for dotl * @old_dentry: dentry for file to link to * @dir: inode destination for new link * @dentry: dentry for link * */ static int v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n", dir->i_ino, old_dentry, dentry); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) return PTR_ERR(dfid); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) { p9_fid_put(dfid); return PTR_ERR(oldfid); } err = p9_client_link(dfid, oldfid, dentry->d_name.name); p9_fid_put(dfid); p9_fid_put(oldfid); if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); return err; } v9fs_invalidate_inode_attr(dir); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { /* Get the latest stat info from server. */ struct p9_fid *fid; fid = v9fs_fid_lookup(old_dentry); if (IS_ERR(fid)) return PTR_ERR(fid); v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); p9_fid_put(fid); } ihold(d_inode(old_dentry)); d_instantiate(dentry, d_inode(old_dentry)); return err; } /** * v9fs_vfs_mknod_dotl - create a special file * @idmap: The idmap of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation * @rdev: device associated with special file * */ static int v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev) { int err; kgid_t gid; const unsigned char *name; umode_t mode; struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); goto error; } gid = v9fs_get_fsgid_for_create(dir); mode = omode; /* Update mode based on ACL value */ err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n", err); goto error; } name = dentry->d_name.name; err = p9_client_mknod_dotl(dfid, name, mode, rdev, gid, &qid); if (err < 0) goto error; v9fs_invalidate_inode_attr(dir); fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { err = PTR_ERR(fid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); goto error; } /* instantiate inode and assign the unopened fid to the dentry */ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); v9fs_fid_add(dentry, &fid); d_instantiate(dentry, inode); err = 0; } else { /* * Not in cached mode. No need to populate inode with stat. 
* socket syscall returns a fd, so we need instantiate */ inode = v9fs_get_inode(dir->i_sb, mode, rdev); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; } v9fs_set_create_acl(inode, fid, dacl, pacl); d_instantiate(dentry, inode); } error: p9_fid_put(fid); v9fs_put_acl(dacl, pacl); p9_fid_put(dfid); return err; } /** * v9fs_vfs_get_link_dotl - follow a symlink path * @dentry: dentry for symlink * @inode: inode for symlink * @done: destructor for return value */ static const char * v9fs_vfs_get_link_dotl(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct p9_fid *fid; char *target; int retval; if (!dentry) return ERR_PTR(-ECHILD); p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); p9_fid_put(fid); if (retval) return ERR_PTR(retval); set_delayed_call(done, kfree_link, target); return target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); /* * Don't update inode if the file type is different */ if (inode_wrong_type(inode, st->st_mode)) goto out; /* * We don't want to refresh inode->i_size, * because we may have cached data */ flags = (v9ses->cache & CACHE_LOOSE) ? V9FS_STAT2INODE_KEEP_ISIZE : 0; v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; } const struct inode_operations v9fs_dir_inode_operations_dotl = { .create = v9fs_vfs_create_dotl, .atomic_open = v9fs_vfs_atomic_open_dotl, .lookup = v9fs_vfs_lookup, .link = v9fs_vfs_link_dotl, .symlink = v9fs_vfs_symlink_dotl, .unlink = v9fs_vfs_unlink, .mkdir = v9fs_vfs_mkdir_dotl, .rmdir = v9fs_vfs_rmdir, .mknod = v9fs_vfs_mknod_dotl, .rename = v9fs_vfs_rename, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_file_inode_operations_dotl = { .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, .get_inode_acl = v9fs_iop_get_inode_acl, .get_acl = v9fs_iop_get_acl, .set_acl = v9fs_iop_set_acl, }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { .get_link = v9fs_vfs_get_link_dotl, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .listxattr = v9fs_listxattr, };
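/*
 * Illustrative user-space sketch (not part of the file above): it mirrors the
 * table-driven flag translation that v9fs_mapped_dotl_flags() and
 * v9fs_open_to_dotl_flags() perform, carrying O_ACCMODE over unchanged and
 * OR-ing in a protocol flag for each Linux open flag that is set.  The
 * DEMO_P9_DOTL_* values below are placeholders chosen for the demo, not the
 * 9P2000.L protocol's actual bit assignments.
 */
#include <fcntl.h>
#include <stdio.h>

#define DEMO_P9_DOTL_CREATE	0x00000040
#define DEMO_P9_DOTL_EXCL	0x00000080
#define DEMO_P9_DOTL_APPEND	0x00000400
#define DEMO_P9_DOTL_DSYNC	0x00001000

struct demo_openflag_map {
	int open_flag;
	int dotl_flag;
};

static int demo_map_dotl_flags(int flags)
{
	static const struct demo_openflag_map map[] = {
		{ O_CREAT,	DEMO_P9_DOTL_CREATE },
		{ O_EXCL,	DEMO_P9_DOTL_EXCL },
		{ O_APPEND,	DEMO_P9_DOTL_APPEND },
		{ O_DSYNC,	DEMO_P9_DOTL_DSYNC },
	};
	int rflags = 0;
	size_t i;

	/* Access mode bits are shared between the two flag spaces. */
	rflags |= flags & O_ACCMODE;
	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (flags & map[i].open_flag)
			rflags |= map[i].dotl_flag;
	return rflags;
}

int main(void)
{
	/* e.g. O_WRONLY | O_CREAT | O_APPEND -> WRONLY + CREATE + APPEND bits */
	printf("mapped flags: 0x%x\n",
	       demo_map_dotl_flags(O_WRONLY | O_CREAT | O_APPEND));
	return 0;
}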
/* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/buffer_head.h * * Everything to do with buffer_heads.
*/ #ifndef _LINUX_BUFFER_HEAD_H #define _LINUX_BUFFER_HEAD_H #include <linux/types.h> #include <linux/blk_types.h> #include <linux/fs.h> #include <linux/linkage.h> #include <linux/pagemap.h> #include <linux/wait.h> #include <linux/atomic.h> enum bh_state_bits { BH_Uptodate, /* Contains valid data */ BH_Dirty, /* Is dirty */ BH_Lock, /* Is locked */ BH_Req, /* Has been submitted for I/O */ BH_Mapped, /* Has a disk mapping */ BH_New, /* Disk mapping was newly created by get_block */ BH_Async_Read, /* Is under end_buffer_async_read I/O */ BH_Async_Write, /* Is under end_buffer_async_write I/O */ BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ BH_Meta, /* Buffer contains metadata */ BH_Prio, /* Buffer should be submitted with REQ_PRIO */ BH_Defer_Completion, /* Defer AIO completion to workqueue */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities */ }; #define MAX_BUF_PER_PAGE (PAGE_SIZE / 512) struct page; struct buffer_head; struct address_space; typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); /* * Historically, a buffer_head was used to map a single block * within a page, and of course as the unit of I/O through the * filesystem and block layers. Nowadays the basic I/O unit * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within * a page (via a page_mapping) and for wrapping bio submission * for backward compatibility reasons (e.g. submit_bh). */ struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ struct buffer_head *b_this_page;/* circular list of page's buffers */ union { struct page *b_page; /* the page this bh is mapped to */ struct folio *b_folio; /* the folio this bh is mapped to */ }; sector_t b_blocknr; /* start block number */ size_t b_size; /* size of mapping */ char *b_data; /* pointer to data within the page */ struct block_device *b_bdev; bh_end_io_t *b_end_io; /* I/O completion */ void *b_private; /* reserved for b_end_io */ struct list_head b_assoc_buffers; /* associated with another mapping */ struct address_space *b_assoc_map; /* mapping this buffer is associated with */ atomic_t b_count; /* users using this buffer_head */ spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to * serialise IO completion of other * buffers in the page */ }; /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. * To avoid reset buffer flags that are already set, because that causes * a costly cache line transition, check the flag first. 
*/ #define BUFFER_FNS(bit, name) \ static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ if (!test_bit(BH_##bit, &(bh)->b_state)) \ set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ clear_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline int buffer_##name(const struct buffer_head *bh) \ { \ return test_bit(BH_##bit, &(bh)->b_state); \ } /* * test_set_buffer_foo() and test_clear_buffer_foo() */ #define TAS_BUFFER_FNS(bit, name) \ static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \ { \ return test_and_set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \ { \ return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ } \ /* * Emit the buffer bitops functions. Note that there are also functions * of the form "mark_buffer_foo()". These are higher-level functions which * do something in addition to setting a b_state bit. */ BUFFER_FNS(Dirty, dirty) TAS_BUFFER_FNS(Dirty, dirty) BUFFER_FNS(Lock, locked) BUFFER_FNS(Req, req) TAS_BUFFER_FNS(Req, req) BUFFER_FNS(Mapped, mapped) BUFFER_FNS(New, new) BUFFER_FNS(Async_Read, async_read) BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Unwritten, unwritten) BUFFER_FNS(Meta, meta) BUFFER_FNS(Prio, prio) BUFFER_FNS(Defer_Completion, defer_completion) static __always_inline void set_buffer_uptodate(struct buffer_head *bh) { /* * If somebody else already set this uptodate, they will * have done the memory barrier, and a reader will thus * see *some* valid buffer state. * * Any other serialization (with IO errors or whatever that * might clear the bit) has to come from other state (eg BH_Lock). 
*/ if (test_bit(BH_Uptodate, &bh->b_state)) return; /* * make it consistent with folio_mark_uptodate * pairs with smp_load_acquire in buffer_uptodate */ smp_mb__before_atomic(); set_bit(BH_Uptodate, &bh->b_state); } static __always_inline void clear_buffer_uptodate(struct buffer_head *bh) { clear_bit(BH_Uptodate, &bh->b_state); } static __always_inline int buffer_uptodate(const struct buffer_head *bh) { /* * make it consistent with folio_test_uptodate * pairs with smp_mb__before_atomic in set_buffer_uptodate */ return test_bit_acquire(BH_Uptodate, &bh->b_state); } static inline unsigned long bh_offset(const struct buffer_head *bh) { return (unsigned long)(bh)->b_data & (page_size(bh->b_page) - 1); } /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) #define page_has_buffers(page) PagePrivate(page) #define folio_buffers(folio) folio_get_private(folio) void buffer_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback); /* * Declarations */ void mark_buffer_dirty(struct buffer_head *bh); void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, gfp_t gfp); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, bool datasync); int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, bool datasync); void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len); static inline void clean_bdev_bh_alias(struct buffer_head *bh) { clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1); } void mark_buffer_async_write(struct buffer_head *bh); void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); void __lock_buffer(struct buffer_head *bh); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags); void submit_bh(blk_opf_t, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait); void __bh_read_batch(int nr, struct buffer_head 
*bhs[], blk_opf_t op_flags, bool force_lock); /* * Generic address_space_operations implementations for buffer_head-backed * address_spaces. */ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); void block_commit_write(struct page *page, unsigned int from, unsigned int to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); #ifdef CONFIG_MIGRATION extern int buffer_migrate_folio(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); extern int buffer_migrate_folio_norefs(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); #else #define buffer_migrate_folio NULL #define buffer_migrate_folio_norefs NULL #endif /* * inline definitions */ static inline void get_bh(struct buffer_head *bh) { atomic_inc(&bh->b_count); } static inline void put_bh(struct buffer_head *bh) { smp_mb__before_atomic(); atomic_dec(&bh->b_count); } static inline void brelse(struct buffer_head *bh) { if (bh) __brelse(bh); } static inline void bforget(struct buffer_head *bh) { if (bh) __bforget(bh); } static inline struct buffer_head * sb_bread(struct super_block *sb, sector_t block) { return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * sb_bread_unmovable(struct super_block *sb, sector_t block) { return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); } static inline void sb_breadahead(struct super_block *sb, sector_t block) { __breadahead(sb->s_bdev, block, sb->s_blocksize); } static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, sector_t block, unsigned size) { gfp_t gfp; gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); gfp |= __GFP_NOFAIL; return bdev_getblk(bdev, block, size, gfp); } static inline struct buffer_head *__getblk(struct block_device *bdev, sector_t block, unsigned size) { gfp_t gfp; gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); gfp |= __GFP_MOVABLE | __GFP_NOFAIL; return bdev_getblk(bdev, block, size, gfp); } static inline struct buffer_head *sb_getblk(struct super_block *sb, sector_t block) { return __getblk(sb->s_bdev, block, sb->s_blocksize); } static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) { return 
bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp); } static inline struct buffer_head * sb_find_get_block(struct super_block *sb, sector_t block) { return __find_get_block(sb->s_bdev, block, sb->s_blocksize); } static inline void map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) { set_buffer_mapped(bh); bh->b_bdev = sb->s_bdev; bh->b_blocknr = block; bh->b_size = sb->s_blocksize; } static inline void wait_on_buffer(struct buffer_head *bh) { might_sleep(); if (buffer_locked(bh)) __wait_on_buffer(bh); } static inline int trylock_buffer(struct buffer_head *bh) { return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); } static inline void lock_buffer(struct buffer_head *bh) { might_sleep(); if (!trylock_buffer(bh)) __lock_buffer(bh); } static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) { if (!buffer_uptodate(bh) && trylock_buffer(bh)) { if (!buffer_uptodate(bh)) __bh_read(bh, op_flags, false); else unlock_buffer(bh); } } static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags) { if (!bh_uptodate_or_lock(bh)) __bh_read(bh, op_flags, false); } /* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */ static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags) { if (bh_uptodate_or_lock(bh)) return 1; return __bh_read(bh, op_flags, true); } static inline void bh_read_batch(int nr, struct buffer_head *bhs[]) { __bh_read_batch(nr, bhs, 0, true); } static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[], blk_opf_t op_flags) { __bh_read_batch(nr, bhs, op_flags, false); } /** * __bread() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read * * Reads a specified block, and returns buffer head that contains it. * The page cache is allocated from movable area so that it can be migrated. * It returns NULL if the block was unreadable. */ static inline struct buffer_head * __bread(struct block_device *bdev, sector_t block, unsigned size) { return __bread_gfp(bdev, block, size, __GFP_MOVABLE); } /** * get_nth_bh - Get a reference on the n'th buffer after this one. * @bh: The buffer to start counting from. * @count: How many buffers to skip. * * This is primarily useful for finding the nth buffer in a folio; in * that case you pass the head buffer and the byte offset in the folio * divided by the block size. It can be used for other purposes, but * it will wrap at the end of the folio rather than returning NULL or * proceeding to the next folio for you. * * Return: The requested buffer with an elevated refcount. 
*/ static inline __must_check struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count) { while (count--) bh = bh->b_this_page; get_bh(bh); return bh; } bool block_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_BUFFER_HEAD void buffer_init(void); bool try_to_free_buffers(struct folio *folio); int inode_has_buffers(struct inode *inode); void invalidate_inode_buffers(struct inode *inode); int remove_inode_buffers(struct inode *inode); int sync_mapping_buffers(struct address_space *mapping); void invalidate_bh_lrus(void); void invalidate_bh_lrus_cpu(void); bool has_bh_in_lru(int cpu, void *dummy); extern int buffer_heads_over_limit; #else /* CONFIG_BUFFER_HEAD */ static inline void buffer_init(void) {} static inline bool try_to_free_buffers(struct folio *folio) { return true; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline void invalidate_bh_lrus(void) {} static inline void invalidate_bh_lrus_cpu(void) {} static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BUFFER_HEAD */ #endif /* _LINUX_BUFFER_HEAD_H */
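/*
 * Illustrative sketch (not part of buffer_head.h itself): the typical read
 * pattern a block-based filesystem builds from the declarations above --
 * sb_bread() pulls one block through the buffer cache (returning NULL on I/O
 * error), b_data gives access to the cached bytes, and brelse() drops the
 * reference.  demo_read_block() and the copy length are hypothetical; error
 * handling is kept minimal.
 */
static int demo_read_block(struct super_block *sb, sector_t block,
			   void *dst, size_t len)
{
	struct buffer_head *bh;

	if (len > sb->s_blocksize)
		return -EINVAL;

	bh = sb_bread(sb, block);	/* may sleep; NULL if unreadable */
	if (!bh)
		return -EIO;

	memcpy(dst, bh->b_data, len);	/* b_data points into the cached page */
	brelse(bh);			/* drop the reference from sb_bread() */
	return 0;
}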
/* SPDX-License-Identifier: GPL-2.0 */ /* Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ #ifndef _LINUX_KALLSYMS_H #define _LINUX_KALLSYMS_H #include <linux/errno.h> #include <linux/buildid.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> #include <asm/sections.h> #define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ (BUILD_ID_SIZE_MAX * 2) + 1) struct cred; struct module; static inline int is_kernel_text(unsigned long addr) { if (__is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (__is_kernel(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_ksym_addr(unsigned long addr) { if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); } static inline void *dereference_symbol_descriptor(void *ptr) { #ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); if (is_ksym_addr((unsigned long)ptr)) return ptr; preempt_disable(); mod = __module_address((unsigned long)ptr); preempt_enable(); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); #endif return ptr; } /* How and when do we show kallsyms values? */ extern bool kallsyms_show_value(const struct cred *cred); #ifdef CONFIG_KALLSYMS unsigned long kallsyms_sym_address(int idx); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data); int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data); /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); /* Lookup an address. modname is set to NULL if it's in the kernel. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf); /* Look up a kernel symbol and return it in a text buffer.
*/ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_build_id(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); extern int sprint_backtrace_build_id(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) { return 0; } static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { return 0; } static inline const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return NULL; } static inline int sprint_symbol(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_symbol_build_id(char *buffer, unsigned long address) { *buffer = '\0'; return 0; } static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace_build_id(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } static inline int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), const char *name, void *data) { return -EOPNOTSUPP; } #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) { printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/
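/*
 * Illustrative sketch (not part of kallsyms.h itself): resolving an address
 * back to "symbol+offset/size [module]" text with sprint_symbol(), using the
 * KSYM_SYMBOL_LEN buffer size defined above, plus the print_ip_sym() helper.
 * demo_report_caller() is hypothetical; with !CONFIG_KALLSYMS the stubs above
 * simply yield an empty string.
 */
static void demo_report_caller(unsigned long addr)
{
	char sym[KSYM_SYMBOL_LEN];

	sprint_symbol(sym, addr);
	printk(KERN_DEBUG "called from %s\n", sym);

	/* Or, without an intermediate buffer: */
	print_ip_sym(KERN_DEBUG, addr);
}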
// SPDX-License-Identifier: GPL-2.0-only /* * This file contains vfs file ops for 9P2000.
* * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/list.h> #include <linux/pagemap.h> #include <linux/utsname.h> #include <linux/uaccess.h> #include <linux/uio.h> #include <linux/slab.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** * v9fs_file_open - open a file (or directory) * @inode: inode to be opened * @file: file being opened * */ int v9fs_file_open(struct inode *inode, struct file *file) { int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; int omode; p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) omode = v9fs_open_to_dotl_flags(file->f_flags); else omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file_dentry(file)); if (IS_ERR(fid)) return PTR_ERR(fid); if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) { int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR; p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n"); err = p9_client_open(fid, writeback_omode); if (err < 0) { p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n"); err = p9_client_open(fid, omode); fid->mode |= P9L_DIRECT; } } else { err = p9_client_open(fid, omode); } if (err < 0) { p9_fid_put(fid); return err; } if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); file->private_data = fid; } #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(V9FS_I(inode)), file->f_mode & FMODE_WRITE); #endif v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags); v9fs_open_fid_add(inode, &fid); return 0; } /** * v9fs_file_lock - lock a file (or directory) * @filp: file to be locked * @cmd: lock command * @fl: file lock structure * * Bugs: this looks like a local only lock, we should extend into 9P * by using open exclusive */ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } return 0; } static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) { struct p9_flock flock; struct p9_fid *fid; uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX); res = locks_lock_file_wait(filp, fl); if (res < 0) goto out; /* convert posix lock to p9 tlock args */ memset(&flock, 0, sizeof(flock)); /* map the lock type */ switch (fl->fl_type) { case F_RDLCK: flock.type = P9_LOCK_TYPE_RDLCK; break; case F_WRLCK: flock.type = P9_LOCK_TYPE_WRLCK; break; case F_UNLCK: flock.type = P9_LOCK_TYPE_UNLCK; break; } flock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) flock.length = 0; else flock.length = fl->fl_end - fl->fl_start + 1; flock.proc_id 
= fl->fl_pid; flock.client_id = fid->clnt->name; if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; v9ses = v9fs_inode2v9ses(file_inode(filp)); /* * if its a blocked request and we get P9_LOCK_BLOCKED as the status * for lock request, keep on trying */ for (;;) { res = p9_client_lock_dotl(fid, &flock, &status); if (res < 0) goto out_unlock; if (status != P9_LOCK_BLOCKED) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; if (schedule_timeout_interruptible(v9ses->session_lock_timeout) != 0) break; /* * p9_client_lock_dotl overwrites flock.client_id with the * server message, free and reuse the client name */ if (flock.client_id != fid->clnt->name) { kfree(flock.client_id); flock.client_id = fid->clnt->name; } } /* map 9p status to VFS status */ switch (status) { case P9_LOCK_SUCCESS: res = 0; break; case P9_LOCK_BLOCKED: res = -EAGAIN; break; default: WARN_ONCE(1, "unknown lock status code: %d\n", status); fallthrough; case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; break; } out_unlock: /* * incase server returned error for lock request, revert * it locally */ if (res < 0 && fl->fl_type != F_UNLCK) { fl_type = fl->fl_type; fl->fl_type = F_UNLCK; /* Even if this fails we want to return the remote error */ locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } if (flock.client_id != fid->clnt->name) kfree(flock.client_id); out: return res; } static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) { struct p9_getlock glock; struct p9_fid *fid; int res = 0; fid = filp->private_data; BUG_ON(fid == NULL); posix_test_lock(filp, fl); /* * if we have a conflicting lock locally, no need to validate * with server */ if (fl->fl_type != F_UNLCK) return res; /* convert posix lock to p9 tgetlock args */ memset(&glock, 0, sizeof(glock)); glock.type = P9_LOCK_TYPE_UNLCK; glock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) glock.length = 0; else glock.length = fl->fl_end - fl->fl_start + 1; glock.proc_id = fl->fl_pid; glock.client_id = fid->clnt->name; res = p9_client_getlock_dotl(fid, &glock); if (res < 0) goto out; /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: fl->fl_type = F_RDLCK; break; case P9_LOCK_TYPE_WRLCK: fl->fl_type = F_WRLCK; break; case P9_LOCK_TYPE_UNLCK: fl->fl_type = F_UNLCK; break; } if (glock.type != P9_LOCK_TYPE_UNLCK) { fl->fl_start = glock.start; if (glock.length == 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = -glock.proc_id; } out: if (glock.client_id != fid->clnt->name) kfree(glock.client_id); return res; } /** * v9fs_file_lock_dotl - lock a file (or directory) * @filp: file to be locked * @cmd: lock command * @fl: file lock structure * */ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); int ret = -ENOLCK; p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } if (IS_SETLK(cmd) || IS_SETLKW(cmd)) ret = v9fs_file_do_lock(filp, cmd, fl); else if (IS_GETLK(cmd)) ret = v9fs_file_getlock(filp, fl); else ret = -EINVAL; return ret; } /** * v9fs_file_flock_dotl - lock a file * @filp: file to be locked * @cmd: lock command * @fl: file lock structure * */ static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); int ret = -ENOLCK; 
p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); if (!(fl->fl_flags & FL_FLOCK)) goto out_err; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } /* Convert flock to posix lock */ fl->fl_flags |= FL_POSIX; fl->fl_flags ^= FL_FLOCK; if (IS_SETLK(cmd) | IS_SETLKW(cmd)) ret = v9fs_file_do_lock(filp, cmd, fl); else ret = -EINVAL; out_err: return ret; } /** * v9fs_file_read_iter - read from a file * @iocb: The operation parameters * @to: The buffer to read into * */ static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, iov_iter_count(to), iocb->ki_pos); if (fid->mode & P9L_DIRECT) return netfs_unbuffered_read_iter(iocb, to); p9_debug(P9_DEBUG_VFS, "(cached)\n"); return netfs_file_read_iter(iocb, to); } /* * v9fs_file_splice_read - splice-read from a file * @in: The 9p file to read from * @ppos: Where to find/update the file position * @pipe: The pipe to splice into * @len: The maximum amount of data to splice * @flags: SPLICE_F_* flags */ static ssize_t v9fs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct p9_fid *fid = in->private_data; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, len, *ppos); if (fid->mode & P9L_DIRECT) return copy_splice_read(in, ppos, pipe, len, flags); return filemap_splice_read(in, ppos, pipe, len, flags); } /** * v9fs_file_write_iter - write to a file * @iocb: The operation parameters * @from: The data to write * */ static ssize_t v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct p9_fid *fid = file->private_data; p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE)) return netfs_unbuffered_write_iter(iocb, from); p9_debug(P9_DEBUG_CACHE, "(cached)\n"); return netfs_file_write_iter(iocb, from); } static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct p9_fid *fid; struct inode *inode = filp->f_mapping->host; struct p9_wstat wstat; int retval; retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); inode_unlock(inode); return retval; } int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync) { struct p9_fid *fid; struct inode *inode = filp->f_mapping->host; int retval; retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); inode_unlock(inode); return retval; } static int v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; struct inode *inode = file_inode(filp); struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp); if (!(v9ses->cache & CACHE_WRITEBACK)) { p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)"); return generic_file_readonly_mmap(filp, vma); } retval = generic_file_mmap(filp, vma); if (!retval) vma->vm_ops = &v9fs_mmap_file_vm_ops; return retval; } static vm_fault_t 
v9fs_vm_page_mkwrite(struct vm_fault *vmf) { return netfs_page_mkwrite(vmf, NULL); } static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { struct inode *inode; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = WB_SYNC_ALL, .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, /* absolute end, byte at end included */ .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start - 1), }; if (!(vma->vm_flags & VM_SHARED)) return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); inode = file_inode(vma->vm_file); filemap_fdatawrite_wbc(inode->i_mapping, &wbc); } static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, }; const struct file_operations v9fs_file_operations = { .llseek = generic_file_llseek, .read_iter = v9fs_file_read_iter, .write_iter = v9fs_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = generic_file_readonly_mmap, .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, }; const struct file_operations v9fs_file_operations_dotl = { .llseek = generic_file_llseek, .read_iter = v9fs_file_read_iter, .write_iter = v9fs_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, .mmap = v9fs_file_mmap, .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, };
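/*
 * Illustrative sketch, not part of vfs_file.c: both v9fs_file_do_lock()
 * and v9fs_file_getlock() above translate a VFS byte range
 * (fl_start/fl_end, where fl_end == OFFSET_MAX means "to end of file")
 * into the 9p wire form (start/length, where length == 0 means "to end
 * of file").  The helper below repeats only that conversion with plain C
 * types so it can be compiled on its own; the struct and function names
 * are invented for the example.
 */
#include <stdint.h>

struct demo_p9_range {			/* hypothetical stand-in for p9_flock/p9_getlock */
	uint64_t start;
	uint64_t length;		/* 0 == lock runs to end of file */
};

static struct demo_p9_range demo_posix_to_p9(int64_t fl_start, int64_t fl_end,
					     int64_t offset_max)
{
	struct demo_p9_range r = { .start = (uint64_t)fl_start, .length = 0 };

	/* an unbounded POSIX lock (fl_end == OFFSET_MAX) becomes length 0 */
	if (fl_end != offset_max)
		r.length = (uint64_t)(fl_end - fl_start + 1);

	return r;
}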
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accommodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 * 3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds than a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /*
Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because it is used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */
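/*
 * Usage sketch, not part of time64.h: it exercises only the helpers
 * declared above (timespec64_valid(), timespec64_sub() and
 * timespec64_to_ns()) to show the intended calling pattern from other
 * kernel code, assuming <linux/time64.h> is included.  The function name
 * is invented for the example.
 */
static inline s64 example_elapsed_ns(const struct timespec64 *start,
				     const struct timespec64 *end)
{
	struct timespec64 delta;

	if (!timespec64_valid(start) || !timespec64_valid(end))
		return 0;

	/* timespec64_sub() returns the normalized difference end - start */
	delta = timespec64_sub(*end, *start);

	/* timespec64_to_ns() saturates at KTIME_MAX/KTIME_MIN on overflow */
	return timespec64_to_ns(&delta);
}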
// SPDX-License-Identifier: GPL-2.0-or-later /* * ASIX AX8817X based USB 2.0 Ethernet Devices * Copyright (C) 2003-2006 David Hollis <dhollis@davehollis.com> * Copyright (C)
2005 Phil Chang <pchang23@sbcglobal.net> * Copyright (C) 2006 James Painter <jamie.painter@iname.com> * Copyright (c) 2002-2003 TiVo Inc. */ #include "asix.h" #define PHY_MODE_MARVELL 0x0000 #define MII_MARVELL_LED_CTRL 0x0018 #define MII_MARVELL_STATUS 0x001b #define MII_MARVELL_CTRL 0x0014 #define MARVELL_LED_MANUAL 0x0019 #define MARVELL_STATUS_HWCFG 0x0004 #define MARVELL_CTRL_TXDELAY 0x0002 #define MARVELL_CTRL_RXDELAY 0x0080 #define PHY_MODE_RTL8211CL 0x000C #define AX88772A_PHY14H 0x14 #define AX88772A_PHY14H_DEFAULT 0x442C #define AX88772A_PHY15H 0x15 #define AX88772A_PHY15H_DEFAULT 0x03C8 #define AX88772A_PHY16H 0x16 #define AX88772A_PHY16H_DEFAULT 0x4044 struct ax88172_int_data { __le16 res1; u8 link; __le16 res2; u8 status; __le16 res3; } __packed; static void asix_status(struct usbnet *dev, struct urb *urb) { struct ax88172_int_data *event; int link; if (urb->actual_length < 8) return; event = urb->transfer_buffer; link = event->link & 0x01; if (netif_carrier_ok(dev->net) != link) { usbnet_link_change(dev, link, 1); netdev_dbg(dev->net, "Link Status is: %d\n", link); } } static void asix_set_netdev_dev_addr(struct usbnet *dev, u8 *addr) { if (is_valid_ether_addr(addr)) { eth_hw_addr_set(dev->net, addr); } else { netdev_info(dev->net, "invalid hw address, using random\n"); eth_hw_addr_random(dev->net); } } /* Get the PHY Identifier from the PHYSID1 & PHYSID2 MII registers */ static u32 asix_get_phyid(struct usbnet *dev) { int phy_reg; u32 phy_id; int i; /* Poll for the rare case the FW or phy isn't ready yet. */ for (i = 0; i < 100; i++) { phy_reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_PHYSID1); if (phy_reg < 0) return 0; if (phy_reg != 0 && phy_reg != 0xFFFF) break; mdelay(1); } if (phy_reg <= 0 || phy_reg == 0xFFFF) return 0; phy_id = (phy_reg & 0xffff) << 16; phy_reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_PHYSID2); if (phy_reg < 0) return 0; phy_id |= (phy_reg & 0xffff); return phy_id; } static u32 asix_get_link(struct net_device *net) { struct usbnet *dev = netdev_priv(net); return mii_link_ok(&dev->mii); } static int asix_ioctl (struct net_device *net, struct ifreq *rq, int cmd) { struct usbnet *dev = netdev_priv(net); return generic_mii_ioctl(&dev->mii, if_mii(rq), cmd, NULL); } /* We need to override some ethtool_ops so we require our own structure so we don't interfere with other usbnet devices that may be connected at the same time. */ static const struct ethtool_ops ax88172_ethtool_ops = { .get_drvinfo = asix_get_drvinfo, .get_link = asix_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_wol = asix_get_wol, .set_wol = asix_set_wol, .get_eeprom_len = asix_get_eeprom_len, .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static void ax88172_set_multicast(struct net_device *net) { struct usbnet *dev = netdev_priv(net); struct asix_data *data = (struct asix_data *)&dev->data; u8 rx_ctl = 0x8c; if (net->flags & IFF_PROMISC) { rx_ctl |= 0x01; } else if (net->flags & IFF_ALLMULTI || netdev_mc_count(net) > AX_MAX_MCAST) { rx_ctl |= 0x02; } else if (netdev_mc_empty(net)) { /* just broadcast and directed */ } else { /* We use the 20 byte dev->data * for our 8 byte filter buffer * to avoid allocating memory that * is tricky to free later */ struct netdev_hw_addr *ha; u32 crc_bits; memset(data->multi_filter, 0, AX_MCAST_FILTER_SIZE); /* Build the multicast hash filter. 
*/ netdev_for_each_mc_addr(ha, net) { crc_bits = ether_crc(ETH_ALEN, ha->addr) >> 26; data->multi_filter[crc_bits >> 3] |= 1 << (crc_bits & 7); } asix_write_cmd_async(dev, AX_CMD_WRITE_MULTI_FILTER, 0, 0, AX_MCAST_FILTER_SIZE, data->multi_filter); rx_ctl |= 0x10; } asix_write_cmd_async(dev, AX_CMD_WRITE_RX_CTL, rx_ctl, 0, 0, NULL); } static int ax88172_link_reset(struct usbnet *dev) { u8 mode; struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); mode = AX88172_MEDIUM_DEFAULT; if (ecmd.duplex != DUPLEX_FULL) mode |= ~AX88172_MEDIUM_FD; netdev_dbg(dev->net, "ax88172_link_reset() speed: %u duplex: %d setting mode to 0x%04x\n", ethtool_cmd_speed(&ecmd), ecmd.duplex, mode); asix_write_medium_mode(dev, mode, 0); return 0; } static const struct net_device_ops ax88172_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_eth_ioctl = asix_ioctl, .ndo_set_rx_mode = ax88172_set_multicast, }; static void asix_phy_reset(struct usbnet *dev, unsigned int reset_bits) { unsigned int timeout = 5000; asix_mdio_write(dev->net, dev->mii.phy_id, MII_BMCR, reset_bits); /* give phy_id a chance to process reset */ udelay(500); /* See IEEE 802.3 "22.2.4.1.1 Reset": 500ms max */ while (timeout--) { if (asix_mdio_read(dev->net, dev->mii.phy_id, MII_BMCR) & BMCR_RESET) udelay(100); else return; } netdev_err(dev->net, "BMCR_RESET timeout on phy_id %d\n", dev->mii.phy_id); } static int ax88172_bind(struct usbnet *dev, struct usb_interface *intf) { int ret = 0; u8 buf[ETH_ALEN] = {0}; int i; unsigned long gpio_bits = dev->driver_info->data; usbnet_get_endpoints(dev,intf); /* Toggle the GPIOs in a manufacturer/model specific way */ for (i = 2; i >= 0; i--) { ret = asix_write_cmd(dev, AX_CMD_WRITE_GPIOS, (gpio_bits >> (i * 8)) & 0xff, 0, 0, NULL, 0); if (ret < 0) goto out; msleep(5); } ret = asix_write_rx_ctl(dev, 0x80, 0); if (ret < 0) goto out; /* Get the MAC address */ ret = asix_read_cmd(dev, AX88172_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, buf, 0); if (ret < 0) { netdev_dbg(dev->net, "read AX_CMD_READ_NODE_ID failed: %d\n", ret); goto out; } asix_set_netdev_dev_addr(dev, buf); /* Initialize MII structure */ dev->mii.dev = dev->net; dev->mii.mdio_read = asix_mdio_read; dev->mii.mdio_write = asix_mdio_write; dev->mii.phy_id_mask = 0x3f; dev->mii.reg_num_mask = 0x1f; dev->mii.phy_id = asix_read_phy_addr(dev, true); if (dev->mii.phy_id < 0) return dev->mii.phy_id; dev->net->netdev_ops = &ax88172_netdev_ops; dev->net->ethtool_ops = &ax88172_ethtool_ops; dev->net->needed_headroom = 4; /* cf asix_tx_fixup() */ dev->net->needed_tailroom = 4; /* cf asix_tx_fixup() */ asix_phy_reset(dev, BMCR_RESET); asix_mdio_write(dev->net, dev->mii.phy_id, MII_ADVERTISE, ADVERTISE_ALL | ADVERTISE_CSMA | ADVERTISE_PAUSE_CAP); mii_nway_restart(&dev->mii); return 0; out: return ret; } static void ax88772_ethtool_get_strings(struct net_device *netdev, u32 sset, u8 *data) { switch (sset) { case ETH_SS_TEST: net_selftest_get_strings(data); break; } } static int ax88772_ethtool_get_sset_count(struct net_device *ndev, int sset) { switch (sset) { case ETH_SS_TEST: return net_selftest_get_count(); default: return -EOPNOTSUPP; } } static void ax88772_ethtool_get_pauseparam(struct net_device *ndev, struct ethtool_pauseparam *pause) { struct usbnet *dev = 
netdev_priv(ndev); struct asix_common_private *priv = dev->driver_priv; phylink_ethtool_get_pauseparam(priv->phylink, pause); } static int ax88772_ethtool_set_pauseparam(struct net_device *ndev, struct ethtool_pauseparam *pause) { struct usbnet *dev = netdev_priv(ndev); struct asix_common_private *priv = dev->driver_priv; return phylink_ethtool_set_pauseparam(priv->phylink, pause); } static const struct ethtool_ops ax88772_ethtool_ops = { .get_drvinfo = asix_get_drvinfo, .get_link = usbnet_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_wol = asix_get_wol, .set_wol = asix_set_wol, .get_eeprom_len = asix_get_eeprom_len, .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = phy_ethtool_nway_reset, .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, .self_test = net_selftest, .get_strings = ax88772_ethtool_get_strings, .get_sset_count = ax88772_ethtool_get_sset_count, .get_pauseparam = ax88772_ethtool_get_pauseparam, .set_pauseparam = ax88772_ethtool_set_pauseparam, }; static int ax88772_reset(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; struct asix_common_private *priv = dev->driver_priv; int ret; /* Rewrite MAC address */ ether_addr_copy(data->mac_addr, dev->net->dev_addr); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, 0); if (ret < 0) goto out; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, 0); if (ret < 0) goto out; ret = asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, 0); if (ret < 0) goto out; phylink_start(priv->phylink); return 0; out: return ret; } static int ax88772_hw_reset(struct usbnet *dev, int in_pm) { struct asix_data *data = (struct asix_data *)&dev->data; struct asix_common_private *priv = dev->driver_priv; u16 rx_ctl; int ret; ret = asix_write_gpio(dev, AX_GPIO_RSE | AX_GPIO_GPO_2 | AX_GPIO_GPO2EN, 5, in_pm); if (ret < 0) goto out; ret = asix_write_cmd(dev, AX_CMD_SW_PHY_SELECT, priv->embd_phy, 0, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Select PHY #1 failed: %d\n", ret); goto out; } if (priv->embd_phy) { ret = asix_sw_reset(dev, AX_SWRESET_IPPD, in_pm); if (ret < 0) goto out; usleep_range(10000, 11000); ret = asix_sw_reset(dev, AX_SWRESET_CLEAR, in_pm); if (ret < 0) goto out; msleep(60); ret = asix_sw_reset(dev, AX_SWRESET_IPRL | AX_SWRESET_PRL, in_pm); if (ret < 0) goto out; } else { ret = asix_sw_reset(dev, AX_SWRESET_IPPD | AX_SWRESET_PRL, in_pm); if (ret < 0) goto out; } msleep(150); if (in_pm && (!asix_mdio_read_nopm(dev->net, dev->mii.phy_id, MII_PHYSID1))){ ret = -EIO; goto out; } ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; ret = asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, in_pm); if (ret < 0) goto out; ret = asix_write_cmd(dev, AX_CMD_WRITE_IPG0, AX88772_IPG0_DEFAULT | AX88772_IPG1_DEFAULT, AX88772_IPG2_DEFAULT, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Write IPG,IPG1,IPG2 failed: %d\n", ret); goto out; } /* Rewrite MAC address */ ether_addr_copy(data->mac_addr, dev->net->dev_addr); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, in_pm); if (ret < 0) goto out; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; rx_ctl = asix_read_rx_ctl(dev, in_pm); netdev_dbg(dev->net, "RX_CTL is 0x%04x after all initializations\n", 
rx_ctl); rx_ctl = asix_read_medium_status(dev, in_pm); netdev_dbg(dev->net, "Medium Status is 0x%04x after all initializations\n", rx_ctl); return 0; out: return ret; } static int ax88772a_hw_reset(struct usbnet *dev, int in_pm) { struct asix_data *data = (struct asix_data *)&dev->data; struct asix_common_private *priv = dev->driver_priv; u16 rx_ctl, phy14h, phy15h, phy16h; int ret; ret = asix_write_gpio(dev, AX_GPIO_RSE, 5, in_pm); if (ret < 0) goto out; ret = asix_write_cmd(dev, AX_CMD_SW_PHY_SELECT, priv->embd_phy | AX_PHYSEL_SSEN, 0, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Select PHY #1 failed: %d\n", ret); goto out; } usleep_range(10000, 11000); ret = asix_sw_reset(dev, AX_SWRESET_IPPD | AX_SWRESET_IPRL, in_pm); if (ret < 0) goto out; usleep_range(10000, 11000); ret = asix_sw_reset(dev, AX_SWRESET_IPRL, in_pm); if (ret < 0) goto out; msleep(160); ret = asix_sw_reset(dev, AX_SWRESET_CLEAR, in_pm); if (ret < 0) goto out; ret = asix_sw_reset(dev, AX_SWRESET_IPRL, in_pm); if (ret < 0) goto out; msleep(200); if (in_pm && (!asix_mdio_read_nopm(dev->net, dev->mii.phy_id, MII_PHYSID1))) { ret = -1; goto out; } if (priv->chipcode == AX_AX88772B_CHIPCODE) { ret = asix_write_cmd(dev, AX_QCTCTRL, 0x8000, 0x8001, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Write BQ setting failed: %d\n", ret); goto out; } } else if (priv->chipcode == AX_AX88772A_CHIPCODE) { /* Check if the PHY registers have default settings */ phy14h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY14H); phy15h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY15H); phy16h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY16H); netdev_dbg(dev->net, "772a_hw_reset: MR20=0x%x MR21=0x%x MR22=0x%x\n", phy14h, phy15h, phy16h); /* Restore PHY registers default setting if not */ if (phy14h != AX88772A_PHY14H_DEFAULT) asix_mdio_write_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY14H, AX88772A_PHY14H_DEFAULT); if (phy15h != AX88772A_PHY15H_DEFAULT) asix_mdio_write_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY15H, AX88772A_PHY15H_DEFAULT); if (phy16h != AX88772A_PHY16H_DEFAULT) asix_mdio_write_nopm(dev->net, dev->mii.phy_id, AX88772A_PHY16H, AX88772A_PHY16H_DEFAULT); } ret = asix_write_cmd(dev, AX_CMD_WRITE_IPG0, AX88772_IPG0_DEFAULT | AX88772_IPG1_DEFAULT, AX88772_IPG2_DEFAULT, 0, NULL, in_pm); if (ret < 0) { netdev_dbg(dev->net, "Write IPG,IPG1,IPG2 failed: %d\n", ret); goto out; } /* Rewrite MAC address */ memcpy(data->mac_addr, dev->net->dev_addr, ETH_ALEN); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, in_pm); if (ret < 0) goto out; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; ret = asix_write_medium_mode(dev, AX88772_MEDIUM_DEFAULT, in_pm); if (ret < 0) return ret; /* Set RX_CTL to default values with 2k buffer, and enable cactus */ ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, in_pm); if (ret < 0) goto out; rx_ctl = asix_read_rx_ctl(dev, in_pm); netdev_dbg(dev->net, "RX_CTL is 0x%04x after all initializations\n", rx_ctl); rx_ctl = asix_read_medium_status(dev, in_pm); netdev_dbg(dev->net, "Medium Status is 0x%04x after all initializations\n", rx_ctl); return 0; out: return ret; } static const struct net_device_ops ax88772_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, .ndo_get_stats64 = dev_get_tstats64, 
.ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_eth_ioctl = phy_do_ioctl_running, .ndo_set_rx_mode = asix_set_multicast, }; static void ax88772_suspend(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; u16 medium; if (netif_running(dev->net)) { rtnl_lock(); phylink_suspend(priv->phylink, false); rtnl_unlock(); } /* Stop MAC operation */ medium = asix_read_medium_status(dev, 1); medium &= ~AX_MEDIUM_RE; asix_write_medium_mode(dev, medium, 1); netdev_dbg(dev->net, "ax88772_suspend: medium=0x%04x\n", asix_read_medium_status(dev, 1)); } static int asix_suspend(struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); struct asix_common_private *priv = dev->driver_priv; if (priv && priv->suspend) priv->suspend(dev); return usbnet_suspend(intf, message); } static void ax88772_resume(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; int i; for (i = 0; i < 3; i++) if (!priv->reset(dev, 1)) break; if (netif_running(dev->net)) { rtnl_lock(); phylink_resume(priv->phylink); rtnl_unlock(); } } static int asix_resume(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct asix_common_private *priv = dev->driver_priv; if (priv && priv->resume) priv->resume(dev); return usbnet_resume(intf); } static int ax88772_init_mdio(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; int ret; priv->mdio = mdiobus_alloc(); if (!priv->mdio) return -ENOMEM; priv->mdio->priv = dev; priv->mdio->read = &asix_mdio_bus_read; priv->mdio->write = &asix_mdio_bus_write; priv->mdio->name = "Asix MDIO Bus"; /* mii bus name is usb-<usb bus number>-<usb device number> */ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "usb-%03d:%03d", dev->udev->bus->busnum, dev->udev->devnum); ret = mdiobus_register(priv->mdio); if (ret) { netdev_err(dev->net, "Could not register MDIO bus (err %d)\n", ret); mdiobus_free(priv->mdio); priv->mdio = NULL; } return ret; } static void ax88772_mdio_unregister(struct asix_common_private *priv) { mdiobus_unregister(priv->mdio); mdiobus_free(priv->mdio); } static int ax88772_init_phy(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; int ret; priv->phydev = mdiobus_get_phy(priv->mdio, priv->phy_addr); if (!priv->phydev) { netdev_err(dev->net, "Could not find PHY\n"); return -ENODEV; } ret = phylink_connect_phy(priv->phylink, priv->phydev); if (ret) { netdev_err(dev->net, "Could not connect PHY\n"); return ret; } phy_suspend(priv->phydev); priv->phydev->mac_managed_pm = true; phy_attached_info(priv->phydev); if (priv->embd_phy) return 0; /* In case main PHY is not the embedded PHY and MAC is RMII clock * provider, we need to suspend embedded PHY by keeping PLL enabled * (AX_SWRESET_IPPD == 0). 
*/ priv->phydev_int = mdiobus_get_phy(priv->mdio, AX_EMBD_PHY_ADDR); if (!priv->phydev_int) { rtnl_lock(); phylink_disconnect_phy(priv->phylink); rtnl_unlock(); netdev_err(dev->net, "Could not find internal PHY\n"); return -ENODEV; } priv->phydev_int->mac_managed_pm = true; phy_suspend(priv->phydev_int); return 0; } static void ax88772_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { /* Nothing to do */ } static void ax88772_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); asix_write_medium_mode(dev, 0, 0); usbnet_link_change(dev, false, false); } static void ax88772_mac_link_up(struct phylink_config *config, struct phy_device *phy, unsigned int mode, phy_interface_t interface, int speed, int duplex, bool tx_pause, bool rx_pause) { struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); u16 m = AX_MEDIUM_AC | AX_MEDIUM_RE; m |= duplex ? AX_MEDIUM_FD : 0; switch (speed) { case SPEED_100: m |= AX_MEDIUM_PS; break; case SPEED_10: break; default: return; } if (tx_pause) m |= AX_MEDIUM_TFC; if (rx_pause) m |= AX_MEDIUM_RFC; asix_write_medium_mode(dev, m, 0); usbnet_link_change(dev, true, false); } static const struct phylink_mac_ops ax88772_phylink_mac_ops = { .mac_config = ax88772_mac_config, .mac_link_down = ax88772_mac_link_down, .mac_link_up = ax88772_mac_link_up, }; static int ax88772_phylink_setup(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; phy_interface_t phy_if_mode; struct phylink *phylink; priv->phylink_config.dev = &dev->net->dev; priv->phylink_config.type = PHYLINK_NETDEV; priv->phylink_config.mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10 | MAC_100; __set_bit(PHY_INTERFACE_MODE_INTERNAL, priv->phylink_config.supported_interfaces); __set_bit(PHY_INTERFACE_MODE_RMII, priv->phylink_config.supported_interfaces); if (priv->embd_phy) phy_if_mode = PHY_INTERFACE_MODE_INTERNAL; else phy_if_mode = PHY_INTERFACE_MODE_RMII; phylink = phylink_create(&priv->phylink_config, dev->net->dev.fwnode, phy_if_mode, &ax88772_phylink_mac_ops); if (IS_ERR(phylink)) return PTR_ERR(phylink); priv->phylink = phylink; return 0; } static int ax88772_bind(struct usbnet *dev, struct usb_interface *intf) { struct asix_common_private *priv; u8 buf[ETH_ALEN] = {0}; int ret, i; priv = devm_kzalloc(&dev->udev->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; dev->driver_priv = priv; usbnet_get_endpoints(dev, intf); /* Maybe the boot loader passed the MAC address via device tree */ if (!eth_platform_get_mac_address(&dev->udev->dev, buf)) { netif_dbg(dev, ifup, dev->net, "MAC address read from device tree"); } else { /* Try getting the MAC address from EEPROM */ if (dev->driver_info->data & FLAG_EEPROM_MAC) { for (i = 0; i < (ETH_ALEN >> 1); i++) { ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM, 0x04 + i, 0, 2, buf + i * 2, 0); if (ret < 0) break; } } else { ret = asix_read_cmd(dev, AX_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, buf, 0); } if (ret < 0) { netdev_dbg(dev->net, "Failed to read MAC address: %d\n", ret); return ret; } } asix_set_netdev_dev_addr(dev, buf); dev->net->netdev_ops = &ax88772_netdev_ops; dev->net->ethtool_ops = &ax88772_ethtool_ops; dev->net->needed_headroom = 4; /* cf asix_tx_fixup() */ dev->net->needed_tailroom = 4; /* cf asix_tx_fixup() */ ret = asix_read_phy_addr(dev, true); if (ret < 0) return ret; priv->phy_addr = ret; priv->embd_phy = ((priv->phy_addr & 0x1f) == AX_EMBD_PHY_ADDR); ret = 
asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, 0, 0, 1, &priv->chipcode, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read STATMNGSTS_REG: %d\n", ret); return ret; } priv->chipcode &= AX_CHIPCODE_MASK; priv->resume = ax88772_resume; priv->suspend = ax88772_suspend; if (priv->chipcode == AX_AX88772_CHIPCODE) priv->reset = ax88772_hw_reset; else priv->reset = ax88772a_hw_reset; ret = priv->reset(dev, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to reset AX88772: %d\n", ret); return ret; } /* Asix framing packs multiple eth frames into a 2K usb bulk transfer */ if (dev->driver_info->flags & FLAG_FRAMING_AX) { /* hard_mtu is still the default - the device does not support jumbo eth frames */ dev->rx_urb_size = 2048; } priv->presvd_phy_bmcr = 0; priv->presvd_phy_advertise = 0; ret = ax88772_init_mdio(dev); if (ret) goto mdio_err; ret = ax88772_phylink_setup(dev); if (ret) goto phylink_err; ret = ax88772_init_phy(dev); if (ret) goto initphy_err; return 0; initphy_err: phylink_destroy(priv->phylink); phylink_err: ax88772_mdio_unregister(priv); mdio_err: return ret; } static int ax88772_stop(struct usbnet *dev) { struct asix_common_private *priv = dev->driver_priv; phylink_stop(priv->phylink); return 0; } static void ax88772_unbind(struct usbnet *dev, struct usb_interface *intf) { struct asix_common_private *priv = dev->driver_priv; rtnl_lock(); phylink_disconnect_phy(priv->phylink); rtnl_unlock(); phylink_destroy(priv->phylink); ax88772_mdio_unregister(priv); asix_rx_fixup_common_free(dev->driver_priv); } static void ax88178_unbind(struct usbnet *dev, struct usb_interface *intf) { asix_rx_fixup_common_free(dev->driver_priv); kfree(dev->driver_priv); } static const struct ethtool_ops ax88178_ethtool_ops = { .get_drvinfo = asix_get_drvinfo, .get_link = asix_get_link, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_wol = asix_get_wol, .set_wol = asix_set_wol, .get_eeprom_len = asix_get_eeprom_len, .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = usbnet_nway_reset, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int marvell_phy_init(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; u16 reg; netdev_dbg(dev->net, "marvell_phy_init()\n"); reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_MARVELL_STATUS); netdev_dbg(dev->net, "MII_MARVELL_STATUS = 0x%04x\n", reg); asix_mdio_write(dev->net, dev->mii.phy_id, MII_MARVELL_CTRL, MARVELL_CTRL_RXDELAY | MARVELL_CTRL_TXDELAY); if (data->ledmode) { reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_MARVELL_LED_CTRL); netdev_dbg(dev->net, "MII_MARVELL_LED_CTRL (1) = 0x%04x\n", reg); reg &= 0xf8ff; reg |= (1 + 0x0100); asix_mdio_write(dev->net, dev->mii.phy_id, MII_MARVELL_LED_CTRL, reg); reg = asix_mdio_read(dev->net, dev->mii.phy_id, MII_MARVELL_LED_CTRL); netdev_dbg(dev->net, "MII_MARVELL_LED_CTRL (2) = 0x%04x\n", reg); } return 0; } static int rtl8211cl_phy_init(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; netdev_dbg(dev->net, "rtl8211cl_phy_init()\n"); asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0x0005); asix_mdio_write (dev->net, dev->mii.phy_id, 0x0c, 0); asix_mdio_write (dev->net, dev->mii.phy_id, 0x01, asix_mdio_read (dev->net, dev->mii.phy_id, 0x01) | 0x0080); asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0); if (data->ledmode == 12) { asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0x0002); asix_mdio_write (dev->net, dev->mii.phy_id, 0x1a, 0x00cb); 
asix_mdio_write (dev->net, dev->mii.phy_id, 0x1f, 0); } return 0; } static int marvell_led_status(struct usbnet *dev, u16 speed) { u16 reg = asix_mdio_read(dev->net, dev->mii.phy_id, MARVELL_LED_MANUAL); netdev_dbg(dev->net, "marvell_led_status() read 0x%04x\n", reg); /* Clear out the center LED bits - 0x03F0 */ reg &= 0xfc0f; switch (speed) { case SPEED_1000: reg |= 0x03e0; break; case SPEED_100: reg |= 0x03b0; break; default: reg |= 0x02f0; } netdev_dbg(dev->net, "marvell_led_status() writing 0x%04x\n", reg); asix_mdio_write(dev->net, dev->mii.phy_id, MARVELL_LED_MANUAL, reg); return 0; } static int ax88178_reset(struct usbnet *dev) { struct asix_data *data = (struct asix_data *)&dev->data; int ret; __le16 eeprom; u8 status; int gpio0 = 0; u32 phyid; ret = asix_read_cmd(dev, AX_CMD_READ_GPIOS, 0, 0, 1, &status, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read GPIOS: %d\n", ret); return ret; } netdev_dbg(dev->net, "GPIO Status: 0x%04x\n", status); asix_write_cmd(dev, AX_CMD_WRITE_ENABLE, 0, 0, 0, NULL, 0); ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM, 0x0017, 0, 2, &eeprom, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read EEPROM: %d\n", ret); return ret; } asix_write_cmd(dev, AX_CMD_WRITE_DISABLE, 0, 0, 0, NULL, 0); netdev_dbg(dev->net, "EEPROM index 0x17 is 0x%04x\n", eeprom); if (eeprom == cpu_to_le16(0xffff)) { data->phymode = PHY_MODE_MARVELL; data->ledmode = 0; gpio0 = 1; } else { data->phymode = le16_to_cpu(eeprom) & 0x7F; data->ledmode = le16_to_cpu(eeprom) >> 8; gpio0 = (le16_to_cpu(eeprom) & 0x80) ? 0 : 1; } netdev_dbg(dev->net, "GPIO0: %d, PhyMode: %d\n", gpio0, data->phymode); /* Power up external GigaPHY through AX88178 GPIO pin */ asix_write_gpio(dev, AX_GPIO_RSE | AX_GPIO_GPO_1 | AX_GPIO_GPO1EN, 40, 0); if ((le16_to_cpu(eeprom) >> 8) != 1) { asix_write_gpio(dev, 0x003c, 30, 0); asix_write_gpio(dev, 0x001c, 300, 0); asix_write_gpio(dev, 0x003c, 30, 0); } else { netdev_dbg(dev->net, "gpio phymode == 1 path\n"); asix_write_gpio(dev, AX_GPIO_GPO1EN, 30, 0); asix_write_gpio(dev, AX_GPIO_GPO1EN | AX_GPIO_GPO_1, 30, 0); } /* Read PHYID register *AFTER* powering up PHY */ phyid = asix_get_phyid(dev); netdev_dbg(dev->net, "PHYID=0x%08x\n", phyid); /* Set AX88178 to enable MII/GMII/RGMII interface for external PHY */ asix_write_cmd(dev, AX_CMD_SW_PHY_SELECT, 0, 0, 0, NULL, 0); asix_sw_reset(dev, 0, 0); msleep(150); asix_sw_reset(dev, AX_SWRESET_PRL | AX_SWRESET_IPPD, 0); msleep(150); asix_write_rx_ctl(dev, 0, 0); if (data->phymode == PHY_MODE_MARVELL) { marvell_phy_init(dev); msleep(60); } else if (data->phymode == PHY_MODE_RTL8211CL) rtl8211cl_phy_init(dev); asix_phy_reset(dev, BMCR_RESET | BMCR_ANENABLE); asix_mdio_write(dev->net, dev->mii.phy_id, MII_ADVERTISE, ADVERTISE_ALL | ADVERTISE_CSMA | ADVERTISE_PAUSE_CAP); asix_mdio_write(dev->net, dev->mii.phy_id, MII_CTRL1000, ADVERTISE_1000FULL); asix_write_medium_mode(dev, AX88178_MEDIUM_DEFAULT, 0); mii_nway_restart(&dev->mii); /* Rewrite MAC address */ memcpy(data->mac_addr, dev->net->dev_addr, ETH_ALEN); ret = asix_write_cmd(dev, AX_CMD_WRITE_NODE_ID, 0, 0, ETH_ALEN, data->mac_addr, 0); if (ret < 0) return ret; ret = asix_write_rx_ctl(dev, AX_DEFAULT_RX_CTL, 0); if (ret < 0) return ret; return 0; } static int ax88178_link_reset(struct usbnet *dev) { u16 mode; struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET }; struct asix_data *data = (struct asix_data *)&dev->data; u32 speed; netdev_dbg(dev->net, "ax88178_link_reset()\n"); mii_check_media(&dev->mii, 1, 1); mii_ethtool_gset(&dev->mii, &ecmd); mode = 
AX88178_MEDIUM_DEFAULT; speed = ethtool_cmd_speed(&ecmd); if (speed == SPEED_1000) mode |= AX_MEDIUM_GM; else if (speed == SPEED_100) mode |= AX_MEDIUM_PS; else mode &= ~(AX_MEDIUM_PS | AX_MEDIUM_GM); mode |= AX_MEDIUM_ENCK; if (ecmd.duplex == DUPLEX_FULL) mode |= AX_MEDIUM_FD; else mode &= ~AX_MEDIUM_FD; netdev_dbg(dev->net, "ax88178_link_reset() speed: %u duplex: %d setting mode to 0x%04x\n", speed, ecmd.duplex, mode); asix_write_medium_mode(dev, mode, 0); if (data->phymode == PHY_MODE_MARVELL && data->ledmode) marvell_led_status(dev, speed); return 0; } static void ax88178_set_mfb(struct usbnet *dev) { u16 mfb = AX_RX_CTL_MFB_16384; u16 rxctl; u16 medium; int old_rx_urb_size = dev->rx_urb_size; if (dev->hard_mtu < 2048) { dev->rx_urb_size = 2048; mfb = AX_RX_CTL_MFB_2048; } else if (dev->hard_mtu < 4096) { dev->rx_urb_size = 4096; mfb = AX_RX_CTL_MFB_4096; } else if (dev->hard_mtu < 8192) { dev->rx_urb_size = 8192; mfb = AX_RX_CTL_MFB_8192; } else if (dev->hard_mtu < 16384) { dev->rx_urb_size = 16384; mfb = AX_RX_CTL_MFB_16384; } rxctl = asix_read_rx_ctl(dev, 0); asix_write_rx_ctl(dev, (rxctl & ~AX_RX_CTL_MFB_16384) | mfb, 0); medium = asix_read_medium_status(dev, 0); if (dev->net->mtu > 1500) medium |= AX_MEDIUM_JFE; else medium &= ~AX_MEDIUM_JFE; asix_write_medium_mode(dev, medium, 0); if (dev->rx_urb_size > old_rx_urb_size) usbnet_unlink_rx_urbs(dev); } static int ax88178_change_mtu(struct net_device *net, int new_mtu) { struct usbnet *dev = netdev_priv(net); int ll_mtu = new_mtu + net->hard_header_len + 4; netdev_dbg(dev->net, "ax88178_change_mtu() new_mtu=%d\n", new_mtu); if ((ll_mtu % dev->maxpacket) == 0) return -EDOM; net->mtu = new_mtu; dev->hard_mtu = net->mtu + net->hard_header_len; ax88178_set_mfb(dev); /* max qlen depend on hard_mtu and rx_urb_size */ usbnet_update_max_qlen(dev); return 0; } static const struct net_device_ops ax88178_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = asix_set_multicast, .ndo_eth_ioctl = asix_ioctl, .ndo_change_mtu = ax88178_change_mtu, }; static int ax88178_bind(struct usbnet *dev, struct usb_interface *intf) { int ret; u8 buf[ETH_ALEN] = {0}; usbnet_get_endpoints(dev,intf); /* Get the MAC address */ ret = asix_read_cmd(dev, AX_CMD_READ_NODE_ID, 0, 0, ETH_ALEN, buf, 0); if (ret < 0) { netdev_dbg(dev->net, "Failed to read MAC address: %d\n", ret); return ret; } asix_set_netdev_dev_addr(dev, buf); /* Initialize MII structure */ dev->mii.dev = dev->net; dev->mii.mdio_read = asix_mdio_read; dev->mii.mdio_write = asix_mdio_write; dev->mii.phy_id_mask = 0x1f; dev->mii.reg_num_mask = 0xff; dev->mii.supports_gmii = 1; dev->mii.phy_id = asix_read_phy_addr(dev, true); if (dev->mii.phy_id < 0) return dev->mii.phy_id; dev->net->netdev_ops = &ax88178_netdev_ops; dev->net->ethtool_ops = &ax88178_ethtool_ops; dev->net->max_mtu = 16384 - (dev->net->hard_header_len + 4); /* Blink LEDS so users know driver saw dongle */ asix_sw_reset(dev, 0, 0); msleep(150); asix_sw_reset(dev, AX_SWRESET_PRL | AX_SWRESET_IPPD, 0); msleep(150); /* Asix framing packs multiple eth frames into a 2K usb bulk transfer */ if (dev->driver_info->flags & FLAG_FRAMING_AX) { /* hard_mtu is still the default - the device does not support jumbo eth frames */ dev->rx_urb_size = 2048; } dev->driver_priv = kzalloc(sizeof(struct asix_common_private), GFP_KERNEL); if 
(!dev->driver_priv) return -ENOMEM; return 0; } static const struct driver_info ax8817x_info = { .description = "ASIX AX8817x USB 2.0 Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x00130103, }; static const struct driver_info dlink_dub_e100_info = { .description = "DLink DUB-E100 USB Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x009f9d9f, }; static const struct driver_info netgear_fa120_info = { .description = "Netgear FA-120 USB Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x00130103, }; static const struct driver_info hawking_uf200_info = { .description = "Hawking UF200 USB Ethernet", .bind = ax88172_bind, .status = asix_status, .link_reset = ax88172_link_reset, .reset = ax88172_link_reset, .flags = FLAG_ETHER | FLAG_LINK_INTR, .data = 0x001f1d1f, }; static const struct driver_info ax88772_info = { .description = "ASIX AX88772 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, }; static const struct driver_info ax88772b_info = { .description = "ASIX AX88772B USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, }; static const struct driver_info lxausb_t1l_info = { .description = "Linux Automation GmbH USB 10Base-T1L", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, }; static const struct driver_info ax88178_info = { .description = "ASIX AX88178 USB 2.0 Ethernet", .bind = ax88178_bind, .unbind = ax88178_unbind, .status = asix_status, .link_reset = ax88178_link_reset, .reset = ax88178_reset, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, }; /* * USBLINK 20F9 "USB 2.0 LAN" USB ethernet adapter, typically found in * no-name packaging. * USB device strings are: * 1: Manufacturer: USBLINK * 2: Product: HG20F9 USB2.0 * 3: Serial: 000003 * Appears to be compatible with Asix 88772B. 
*/ static const struct driver_info hg20f9_info = { .description = "HG20F9 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, .status = asix_status, .reset = ax88772_reset, .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, }; static const struct usb_device_id products [] = { { // Linksys USB200M USB_DEVICE (0x077b, 0x2226), .driver_info = (unsigned long) &ax8817x_info, }, { // Netgear FA120 USB_DEVICE (0x0846, 0x1040), .driver_info = (unsigned long) &netgear_fa120_info, }, { // DLink DUB-E100 USB_DEVICE (0x2001, 0x1a00), .driver_info = (unsigned long) &dlink_dub_e100_info, }, { // Intellinet, ST Lab USB Ethernet USB_DEVICE (0x0b95, 0x1720), .driver_info = (unsigned long) &ax8817x_info, }, { // Hawking UF200, TrendNet TU2-ET100 USB_DEVICE (0x07b8, 0x420a), .driver_info = (unsigned long) &hawking_uf200_info, }, { // Billionton Systems, USB2AR USB_DEVICE (0x08dd, 0x90ff), .driver_info = (unsigned long) &ax8817x_info, }, { // Billionton Systems, GUSB2AM-1G-B USB_DEVICE(0x08dd, 0x0114), .driver_info = (unsigned long) &ax88178_info, }, { // ATEN UC210T USB_DEVICE (0x0557, 0x2009), .driver_info = (unsigned long) &ax8817x_info, }, { // Buffalo LUA-U2-KTX USB_DEVICE (0x0411, 0x003d), .driver_info = (unsigned long) &ax8817x_info, }, { // Buffalo LUA-U2-GT 10/100/1000 USB_DEVICE (0x0411, 0x006e), .driver_info = (unsigned long) &ax88178_info, }, { // Sitecom LN-029 "USB 2.0 10/100 Ethernet adapter" USB_DEVICE (0x6189, 0x182d), .driver_info = (unsigned long) &ax8817x_info, }, { // Sitecom LN-031 "USB 2.0 10/100/1000 Ethernet adapter" USB_DEVICE (0x0df6, 0x0056), .driver_info = (unsigned long) &ax88178_info, }, { // Sitecom LN-028 "USB 2.0 10/100/1000 Ethernet adapter" USB_DEVICE (0x0df6, 0x061c), .driver_info = (unsigned long) &ax88178_info, }, { // corega FEther USB2-TX USB_DEVICE (0x07aa, 0x0017), .driver_info = (unsigned long) &ax8817x_info, }, { // Surecom EP-1427X-2 USB_DEVICE (0x1189, 0x0893), .driver_info = (unsigned long) &ax8817x_info, }, { // goodway corp usb gwusb2e USB_DEVICE (0x1631, 0x6200), .driver_info = (unsigned long) &ax8817x_info, }, { // JVC MP-PRX1 Port Replicator USB_DEVICE (0x04f1, 0x3008), .driver_info = (unsigned long) &ax8817x_info, }, { // Lenovo U2L100P 10/100 USB_DEVICE (0x17ef, 0x7203), .driver_info = (unsigned long)&ax88772b_info, }, { // ASIX AX88772B 10/100 USB_DEVICE (0x0b95, 0x772b), .driver_info = (unsigned long) &ax88772b_info, }, { // ASIX AX88772 10/100 USB_DEVICE (0x0b95, 0x7720), .driver_info = (unsigned long) &ax88772_info, }, { // ASIX AX88178 10/100/1000 USB_DEVICE (0x0b95, 0x1780), .driver_info = (unsigned long) &ax88178_info, }, { // Logitec LAN-GTJ/U2A USB_DEVICE (0x0789, 0x0160), .driver_info = (unsigned long) &ax88178_info, }, { // Linksys USB200M Rev 2 USB_DEVICE (0x13b1, 0x0018), .driver_info = (unsigned long) &ax88772_info, }, { // 0Q0 cable ethernet USB_DEVICE (0x1557, 0x7720), .driver_info = (unsigned long) &ax88772_info, }, { // DLink DUB-E100 H/W Ver B1 USB_DEVICE (0x07d1, 0x3c05), .driver_info = (unsigned long) &ax88772_info, }, { // DLink DUB-E100 H/W Ver B1 Alternate USB_DEVICE (0x2001, 0x3c05), .driver_info = (unsigned long) &ax88772_info, }, { // DLink DUB-E100 H/W Ver C1 USB_DEVICE (0x2001, 0x1a02), .driver_info = (unsigned long) &ax88772_info, }, { // Linksys USB1000 USB_DEVICE (0x1737, 0x0039), .driver_info = (unsigned long) &ax88178_info, }, { // IO-DATA ETG-US2 USB_DEVICE (0x04bb, 0x0930), .driver_info = 
(unsigned long) &ax88178_info, }, { // Belkin F5D5055 USB_DEVICE(0x050d, 0x5055), .driver_info = (unsigned long) &ax88178_info, }, { // Apple USB Ethernet Adapter USB_DEVICE(0x05ac, 0x1402), .driver_info = (unsigned long) &ax88772_info, }, { // Cables-to-Go USB Ethernet Adapter USB_DEVICE(0x0b95, 0x772a), .driver_info = (unsigned long) &ax88772_info, }, { // ABOCOM for pci USB_DEVICE(0x14ea, 0xab11), .driver_info = (unsigned long) &ax88178_info, }, { // ASIX 88772a USB_DEVICE(0x0db0, 0xa877), .driver_info = (unsigned long) &ax88772_info, }, { // Asus USB Ethernet Adapter USB_DEVICE (0x0b95, 0x7e2b), .driver_info = (unsigned long)&ax88772b_info, }, { /* ASIX 88172a demo board */ USB_DEVICE(0x0b95, 0x172a), .driver_info = (unsigned long) &ax88172a_info, }, { /* * USBLINK HG20F9 "USB 2.0 LAN" * Appears to have gazumped Linksys's manufacturer ID but * doesn't (yet) conflict with any known Linksys product. */ USB_DEVICE(0x066b, 0x20f9), .driver_info = (unsigned long) &hg20f9_info, }, { // Linux Automation GmbH USB 10Base-T1L USB_DEVICE(0x33f7, 0x0004), .driver_info = (unsigned long) &lxausb_t1l_info, }, { }, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver asix_driver = { .name = DRIVER_NAME, .id_table = products, .probe = usbnet_probe, .suspend = asix_suspend, .resume = asix_resume, .reset_resume = asix_resume, .disconnect = usbnet_disconnect, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(asix_driver); MODULE_AUTHOR("David Hollis"); MODULE_VERSION(DRIVER_VERSION); MODULE_DESCRIPTION("ASIX AX8817X based USB 2.0 Ethernet Devices"); MODULE_LICENSE("GPL");
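/*
 * Illustrative sketch, not part of this driver: ax88172_set_multicast()
 * above hashes every multicast MAC address into the 8-byte (64-bit)
 * hardware filter by taking the top six bits of the Ethernet CRC and
 * using them as a bit index.  The helper below restates just that
 * indexing step, assuming <linux/crc32.h> and <linux/etherdevice.h> for
 * ether_crc()/ETH_ALEN; the function name is invented for the example.
 */
static void demo_set_mcast_filter_bit(u8 *filter /* AX_MCAST_FILTER_SIZE == 8 bytes */,
				      const u8 *mac_addr)
{
	u32 crc_bits = ether_crc(ETH_ALEN, mac_addr) >> 26;	/* 0..63 */

	filter[crc_bits >> 3] |= 1 << (crc_bits & 7);		/* byte index, then bit */
}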
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ /* 6 bucketsize, initval support added */ #define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport #define IP_SET_HASH_WITH_NETMASK #define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ /* Member elements */ struct hash_ipport4_elem { __be32 ip; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, const struct hash_ipport4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipport4_data_list(struct sk_buff *skb, const struct hash_ipport4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipport4_data_next(struct hash_ipport4_elem *next, const struct
hash_ipport4_elem *d) { next->ip = d->ip; next->port = d->port; } #define MTYPE hash_ipport4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= h->bitmask.ip; if (e.ip == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipport4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.ip &= h->bitmask.ip; if (e.ip == 0) return -EINVAL; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_ipport4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipport6_elem { union nf_inet_addr ip; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, const struct hash_ipport6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipport6_data_list(struct sk_buff *skb, const struct hash_ipport6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipport6_data_next(struct hash_ipport6_elem *next, const struct hash_ipport6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipport6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipport6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 
0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipport_type __read_mostly = { .name = "hash:ip,port", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipport_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipport_init(void) { return ip_set_type_register(&hash_ipport_type); } static void __exit hash_ipport_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipport_type); } module_init(hash_ipport_init); module_exit(hash_ipport_fini);
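/*
 * A minimal stand-alone sketch (not kernel code) of the range expansion
 * performed by hash_ipport4_uadt() above: a host-order IPv4 range and a
 * port range are walked element by element, and the walk is capped so one
 * request cannot add an unbounded number of entries.  MAX_PER_CALL is a
 * made-up stand-in for the kernel's IPSET_MAX_RANGE, and the "resume hint"
 * mirrors how the real code stores h->next before returning -ERANGE.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_PER_CALL 8			/* hypothetical cap, illustration only */

struct resume_hint {
	uint32_t ip;
	uint32_t port;
};

/* Returns 0 when the whole range was added, -1 when the cap was hit. */
static int add_ip_port_range(uint32_t ip, uint32_t ip_to,
			     uint32_t port, uint32_t port_to,
			     struct resume_hint *next)
{
	unsigned int i = 0;
	uint32_t p;

	for (; ip <= ip_to; ip++) {
		for (p = port; p <= port_to; p++, i++) {
			if (i > MAX_PER_CALL) {
				next->ip = ip;	/* like h->next.ip */
				next->port = p;	/* like h->next.port */
				return -1;
			}
			printf("add %u.%u.%u.%u,%u\n",
			       ip >> 24, (ip >> 16) & 0xff,
			       (ip >> 8) & 0xff, ip & 0xff, p);
		}
	}
	return 0;
}

int main(void)
{
	struct resume_hint next = { 0, 0 };

	/* 192.168.0.1-192.168.0.2, ports 80-85: 12 elements, cap stops the walk */
	if (add_ip_port_range(0xc0a80001, 0xc0a80002, 80, 85, &next))
		printf("capped, resume at 0x%x port %u\n", next.ip, next.port);
	return 0;
}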
3 2 1 5 2 2 1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <uapi/linux/fadvise.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "advise.h" struct io_fadvise { struct file *file; u64 offset; u32 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; u32 len; u32 advice; }; int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; #else return -EOPNOTSUPP; #endif } int io_madvise(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); return IOU_OK; #else return -EOPNOTSUPP; #endif } static bool io_fadvise_force_async(struct io_fadvise *fa) { switch (fa->advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_SEQUENTIAL: return false; default: return true; } } int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa)); ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
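/*
 * A hedged user-space sketch of driving the IORING_OP_FADVISE path handled
 * by io_fadvise_prep()/io_fadvise() above.  It assumes liburing is
 * installed and that its io_uring_prep_fadvise() helper and a kernel with
 * fadvise support are available; the file name is arbitrary.
 * POSIX_FADV_SEQUENTIAL is one of the advice values that
 * io_fadvise_force_async() allows to complete without going async.
 */
#include <fcntl.h>
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd;

	fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0 || io_uring_queue_init(4, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* offset 0, len 0: the advice applies to the whole file */
	io_uring_prep_fadvise(sqe, fd, 0, 0, POSIX_FADV_SEQUENTIAL);

	io_uring_submit(&ring);
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("fadvise result: %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}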
866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/util.c * Copyright (C) 1992 Krishna Balasubramanian * * Sep 1997 - Call suser() last after "normal" permission checks so we * get BSD style process accounting right. * Occurs in several places in the IPC code. * Chris Evans, <chris@ferret.lmh.ox.ac.uk> * Nov 1999 - ipc helper functions, unified SMP locking * Manfred Spraul <manfred@colorfullife.com> * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary(). * Mingming Cao <cmm@us.ibm.com> * Mar 2006 - support for audit of ipc object properties * Dustin Kirkland <dustin.kirkland@us.ibm.com> * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> * * General sysv ipc locking scheme: * rcu_read_lock() * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr * tree. * - perform initial checks (capabilities, auditing and permission, * etc). * - perform read-only operations, such as INFO command, that * do not demand atomicity * acquire the ipc lock (kern_ipc_perm.lock) through * ipc_lock_object() * - perform read-only operations that demand atomicity, * such as STAT command. * - perform data updates, such as SET, RMID commands and * mechanism-specific operations (semop/semtimedop, * msgsnd/msgrcv, shmat/shmdt). * drop the ipc lock, through ipc_unlock_object(). * rcu_read_unlock() * * The ids->rwsem must be taken when: * - creating, removing and iterating the existing entries in ipc * identifier sets. * - iterating through files under /proc/sysvipc/ * * Note that sems have a special fast path that avoids kern_ipc_perm.lock - * see sem_lock(). */ #include <linux/mm.h> #include <linux/shm.h> #include <linux/init.h> #include <linux/msg.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/capability.h> #include <linux/highuid.h> #include <linux/security.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/audit.h> #include <linux/nsproxy.h> #include <linux/rwsem.h> #include <linux/memory.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/log2.h> #include <asm/unistd.h> #include "util.h" struct ipc_proc_iface { const char *path; const char *header; int ids; int (*show)(struct seq_file *, void *); }; /** * ipc_init - initialise ipc subsystem * * The various sysv ipc resources (semaphores, messages and shared * memory) are initialised. * * A callback routine is registered into the memory hotplug notifier * chain: since msgmni scales to lowmem this callback routine will be * called upon successful memory add / remove to recompute msmgni. */ static int __init ipc_init(void) { proc_mkdir("sysvipc", NULL); sem_init(); msg_init(); shm_init(); return 0; } device_initcall(ipc_init); static const struct rhashtable_params ipc_kht_params = { .head_offset = offsetof(struct kern_ipc_perm, khtnode), .key_offset = offsetof(struct kern_ipc_perm, key), .key_len = sizeof_field(struct kern_ipc_perm, key), .automatic_shrinking = true, }; /** * ipc_init_ids - initialise ipc identifiers * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited * below ipc_mni) then initialise the keys hashtable and ids idr. 
*/ void ipc_init_ids(struct ipc_ids *ids) { ids->in_use = 0; ids->seq = 0; init_rwsem(&ids->rwsem); rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); ids->max_idx = -1; ids->last_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif } #ifdef CONFIG_PROC_FS static const struct proc_ops sysvipc_proc_ops; /** * ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface. * @path: Path in procfs * @header: Banner to be printed at the beginning of the file. * @ids: ipc id table to iterate. * @show: show routine. */ void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)) { struct proc_dir_entry *pde; struct ipc_proc_iface *iface; iface = kmalloc(sizeof(*iface), GFP_KERNEL); if (!iface) return; iface->path = path; iface->header = header; iface->ids = ids; iface->show = show; pde = proc_create_data(path, S_IRUGO, /* world readable */ NULL, /* parent dir */ &sysvipc_proc_ops, iface); if (!pde) kfree(iface); } #endif /** * ipc_findkey - find a key in an ipc identifier set * @ids: ipc identifier set * @key: key to find * * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * * Called with writer ipc_ids.rwsem held. */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { struct kern_ipc_perm *ipcp; ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, ipc_kht_params); if (!ipcp) return NULL; rcu_read_lock(); ipc_lock_object(ipcp); return ipcp; } /* * Insert new IPC object into idr tree, and set sequence number and id * in the correct order. * Especially: * - the sequence number must be set before inserting the object into the idr, * because the sequence number is accessed without a lock. * - the id can/must be set after inserting the object into the idr. * All accesses must be done after getting kern_ipc_perm.lock. * * The caller must own kern_ipc_perm.lock.of the new object. * On error, the function returns a (negative) error code. * * To conserve sequence number space, especially with extended ipc_mni, * the sequence number is incremented only when the returned ID is less than * the last one. */ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { int idx, next_id = -1; #ifdef CONFIG_CHECKPOINT_RESTORE next_id = ids->next_id; ids->next_id = -1; #endif /* * As soon as a new object is inserted into the idr, * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it, * and the lockless preparations for ipc operations can start. * This means especially: permission checks, audit calls, allocation * of undo structures, ... * * Thus the object must be fully initialized, and if something fails, * then the full tear-down sequence must be followed. * (i.e.: set new->deleted, reduce refcount, call_rcu()) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ int max_idx; max_idx = max(ids->in_use*3/2, ipc_min_cycle); max_idx = min(max_idx, ipc_mni); /* allocate the idx, with a NULL struct kern_ipc_perm */ idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, GFP_NOWAIT); if (idx >= 0) { /* * idx got allocated successfully. * Now calculate the sequence number and set the * pointer for real. 
*/ if (idx <= ids->last_idx) { ids->seq++; if (ids->seq >= ipcid_seq_max()) ids->seq = 0; } ids->last_idx = idx; new->seq = ids->seq; /* no need for smp_wmb(), this is done * inside idr_replace, as part of * rcu_assign_pointer */ idr_replace(&ids->ipcs_idr, new, idx); } } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, GFP_NOWAIT); } if (idx >= 0) new->id = (new->seq << ipcmni_seq_shift()) + idx; return idx; } /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set * @new: new ipc permission set * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. The permissions object is * initialised and the first free entry is set up and the index assigned * is returned. The 'new' entry is returned in a locked state on success. * * On failure the entry is not locked and a negative err-code is returned. * The caller must use ipc_rcu_putref() to free the identifier. * * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; int idx, err; /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); if (limit > ipc_mni) limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); spin_lock_init(&new->lock); rcu_read_lock(); spin_lock(&new->lock); current_euid_egid(&euid, &egid); new->cuid = new->uid = euid; new->gid = new->cgid = egid; new->deleted = false; idx = ipc_idr_alloc(ids, new); idr_preload_end(); if (idx >= 0 && new->key != IPC_PRIVATE) { err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, ipc_kht_params); if (err < 0) { idr_remove(&ids->ipcs_idr, idx); idx = err; } } if (idx < 0) { new->deleted = true; spin_unlock(&new->lock); rcu_read_unlock(); return idx; } ids->in_use++; if (idx > ids->max_idx) ids->max_idx = idx; return idx; } /** * ipcget_new - create a new ipc object * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is IPC_PRIVATE. */ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { int err; down_write(&ids->rwsem); err = ops->getnew(ns, params); up_write(&ids->rwsem); return err; } /** * ipc_check_perms - check security and permissions for an ipc object * @ns: ipc namespace * @ipcp: ipc permission set * @ops: the actual security routine to call * @params: its parameters * * This routine is called by sys_msgget(), sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE and that key already exists in the * ds IDR. * * On success, the ipc id is returned. * * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, const struct ipc_ops *ops, struct ipc_params *params) { int err; if (ipcperms(ns, ipcp, params->flg)) err = -EACCES; else { err = ops->associate(ipcp, params->flg); if (!err) err = ipcp->id; } return err; } /** * ipcget_public - get an ipc object or create a new one * @ns: ipc namespace * @ids: ipc identifier set * @ops: the actual creation routine to call * @params: its parameters * * This routine is called by sys_msgget, sys_semget() and sys_shmget() * when the key is not IPC_PRIVATE. * It adds a new entry if the key is not found and does some permission * / security checkings if the key is found. 
* * On success, the ipc id is returned. */ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { struct kern_ipc_perm *ipcp; int flg = params->flg; int err; /* * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ if (!(flg & IPC_CREAT)) err = -ENOENT; else err = ops->getnew(ns, params); } else { /* ipc object has been locked by ipc_findkey() */ if (flg & IPC_CREAT && flg & IPC_EXCL) err = -EEXIST; else { err = 0; if (ops->more_checks) err = ops->more_checks(ipcp, params); if (!err) /* * ipc_check_perms returns the IPC id on * success */ err = ipc_check_perms(ns, ipcp, ops, params); } ipc_unlock(ipcp); } up_write(&ids->rwsem); return err; } /** * ipc_kht_remove - remove an ipc from the key hashtable * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, ipc_kht_params)); } /** * ipc_search_maxidx - search for the highest assigned index * @ids: ipc identifier set * @limit: known upper limit for highest assigned index * * The function determines the highest assigned index in @ids. It is intended * to be called when ids->max_idx needs to be updated. * Updating ids->max_idx is necessary when the current highest index ipc * object is deleted. * If no ipc object is allocated, then -1 is returned. * * ipc_ids.rwsem needs to be held by the caller. */ static int ipc_search_maxidx(struct ipc_ids *ids, int limit) { int tmpidx; int i; int retval; i = ilog2(limit+1); retval = 0; for (; i >= 0; i--) { tmpidx = retval | (1<<i); /* * "0" is a possible index value, thus search using * e.g. 15,7,3,1,0 instead of 16,8,4,2,1. */ tmpidx = tmpidx-1; if (idr_get_next(&ids->ipcs_idr, &tmpidx)) retval |= (1<<i); } return retval - 1; } /** * ipc_rmid - remove an ipc identifier * @ids: ipc identifier set * @ipcp: ipc perm structure containing the identifier to remove * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; if (unlikely(idx == ids->max_idx)) { idx = ids->max_idx-1; if (idx >= 0) idx = ipc_search_maxidx(ids, idx); ids->max_idx = idx; } } /** * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE * @ids: ipc identifier set * @ipcp: ipc perm structure containing the key to modify * * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. 
*/ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { ipc_kht_remove(ids, ipcp); ipcp->key = IPC_PRIVATE; } bool ipc_rcu_getref(struct kern_ipc_perm *ptr) { return refcount_inc_not_zero(&ptr->refcount); } void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)) { if (!refcount_dec_and_test(&ptr->refcount)) return; call_rcu(&ptr->rcu, func); } /** * ipcperms - check ipc permissions * @ns: ipc namespace * @ipcp: ipc permission set * @flag: desired permission set * * Check user, group, other permissions for access * to ipc resources. return 0 if allowed * * @flag will most probably be 0 or ``S_...UGO`` from <linux/stat.h> */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) { kuid_t euid = current_euid(); int requested_mode, granted_mode; audit_ipc_obj(ipcp); requested_mode = (flag >> 6) | (flag >> 3) | flag; granted_mode = ipcp->mode; if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid)) granted_mode >>= 6; else if (in_group_p(ipcp->cgid) || in_group_p(ipcp->gid)) granted_mode >>= 3; /* is there some bit set in requested_mode but not in granted_mode? */ if ((requested_mode & ~granted_mode & 0007) && !ns_capable(ns->user_ns, CAP_IPC_OWNER)) return -1; return security_ipc_permission(ipcp, flag); } /* * Functions to convert between the kern_ipc_perm structure and the * old/new ipc_perm structures */ /** * kernel_to_ipc64_perm - convert kernel ipc permissions to user * @in: kernel permissions * @out: new style ipc permissions * * Turn the kernel object @in into a set of permissions descriptions * for returning to userspace (@out). */ void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out) { out->key = in->key; out->uid = from_kuid_munged(current_user_ns(), in->uid); out->gid = from_kgid_munged(current_user_ns(), in->gid); out->cuid = from_kuid_munged(current_user_ns(), in->cuid); out->cgid = from_kgid_munged(current_user_ns(), in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc64_perm_to_ipc_perm - convert new ipc permissions to old * @in: new style ipc permissions * @out: old style ipc permissions * * Turn the new style permissions object @in into a compatibility * object and store it into the @out pointer. */ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) { out->key = in->key; SET_UID(out->uid, in->uid); SET_GID(out->gid, in->gid); SET_UID(out->cuid, in->cuid); SET_GID(out->cgid, in->cgid); out->mode = in->mode; out->seq = in->seq; } /** * ipc_obtain_object_idr * @ids: ipc identifier set * @id: ipc id to look for * * Look for an id in the ipc ids idr and return associated ipc object. * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; int idx = ipcid_to_idx(id); out = idr_find(&ids->ipcs_idr, idx); if (!out) return ERR_PTR(-EINVAL); return out; } /** * ipc_obtain_object_check * @ids: ipc identifier set * @id: ipc id to look for * * Similar to ipc_obtain_object_idr() but also checks the ipc object * sequence number. * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. 
*/ struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out = ipc_obtain_object_idr(ids, id); if (IS_ERR(out)) goto out; if (ipc_checkid(out, id)) return ERR_PTR(-EINVAL); out: return out; } /** * ipcget - Common sys_*get() code * @ns: namespace * @ids: ipc identifier set * @ops: operations to be called on ipc object creation, permission checks * and further checks * @params: the parameters needed by the previous operations. * * Common routine called by sys_msgget(), sys_semget() and sys_shmget(). */ int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, const struct ipc_ops *ops, struct ipc_params *params) { if (params->key == IPC_PRIVATE) return ipcget_new(ns, ids, ops, params); else return ipcget_public(ns, ids, ops, params); } /** * ipc_update_perm - update the permissions of an ipc object * @in: the permission given as input. * @out: the permission of the ipc to set. */ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) { kuid_t uid = make_kuid(current_user_ns(), in->uid); kgid_t gid = make_kgid(current_user_ns(), in->gid); if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; out->uid = uid; out->gid = gid; out->mode = (out->mode & ~S_IRWXUGO) | (in->mode & S_IRWXUGO); return 0; } /** * ipcctl_obtain_check - retrieve an ipc object and check permissions * @ns: ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve * @cmd: the cmd to check * @perm: the permission to set * @extra_perm: one extra permission parameter used by msq * * This function does some common audit and permissions check for some IPC_XXX * cmd and is called from semctl_down, shmctl_down and msgctl_down. * * It: * - retrieves the ipc object with the given id in the given table. * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding * error. * * Call holding the both the rwsem and the rcu read lock. */ struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; struct kern_ipc_perm *ipcp; ipcp = ipc_obtain_object_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto err; } audit_ipc_obj(ipcp); if (cmd == IPC_SET) audit_ipc_set_perm(extra_perm, perm->uid, perm->gid, perm->mode); euid = current_euid(); if (uid_eq(euid, ipcp->cuid) || uid_eq(euid, ipcp->uid) || ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ipcp; /* successful lookup */ err: return ERR_PTR(err); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /** * ipc_parse_version - ipc call version * @cmd: pointer to command * * Return IPC_64 for new style IPC and IPC_OLD for old style IPC. * The @cmd value is turned from an encoding command and version into * just the command code. */ int ipc_parse_version(int *cmd) { if (*cmd & IPC_64) { *cmd ^= IPC_64; return IPC_64; } else { return IPC_OLD; } } #endif /* CONFIG_ARCH_WANT_IPC_PARSE_VERSION */ #ifdef CONFIG_PROC_FS struct ipc_proc_iter { struct ipc_namespace *ns; struct pid_namespace *pid_ns; struct ipc_proc_iface *iface; }; struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) { struct ipc_proc_iter *iter = s->private; return iter->pid_ns; } /** * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos * @ids: ipc identifier set * @pos: expected position * * The function finds an ipc structure, based on the sequence file * position @pos. 
If there is no ipc structure at position @pos, then * the successor is selected. * If a structure is found, then it is locked (both rcu_read_lock() and * ipc_lock_object()) and @pos is set to the position needed to locate * the found ipc structure. * If nothing is found (i.e. EOF), @pos is not modified. * * The function returns the found ipc structure, or NULL at EOF. */ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { int tmpidx; struct kern_ipc_perm *ipc; /* convert from position to idr index -> "-1" */ tmpidx = *pos - 1; ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); if (ipc != NULL) { rcu_read_lock(); ipc_lock_object(ipc); /* convert from idr index to position -> "+1" */ *pos = tmpidx + 1; } return ipc; } static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct kern_ipc_perm *ipc = it; /* If we had an ipc id locked before, unlock it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); /* Next -> search for *pos+1 */ (*pos)++; return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; ids = &iter->ns->ids[iface->ids]; /* * Take the lock - this will be released by the corresponding * call to stop(). */ down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) return NULL; /* pos == 0 means header */ if (*pos == 0) return SEQ_START_TOKEN; /* Otherwise return the correct ipc structure */ return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) { struct kern_ipc_perm *ipc = it; struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; struct ipc_ids *ids; /* If we had a locked structure, release it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) { struct ipc_proc_iter *iter = s->private; struct ipc_proc_iface *iface = iter->iface; if (it == SEQ_START_TOKEN) { seq_puts(s, iface->header); return 0; } return iface->show(s, it); } static const struct seq_operations sysvipc_proc_seqops = { .start = sysvipc_proc_start, .stop = sysvipc_proc_stop, .next = sysvipc_proc_next, .show = sysvipc_proc_show, }; static int sysvipc_proc_open(struct inode *inode, struct file *file) { struct ipc_proc_iter *iter; iter = __seq_open_private(file, &sysvipc_proc_seqops, sizeof(*iter)); if (!iter) return -ENOMEM; iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); return 0; } static int sysvipc_proc_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct ipc_proc_iter *iter = seq->private; put_ipc_ns(iter->ns); put_pid_ns(iter->pid_ns); return seq_release_private(inode, file); } static const struct proc_ops sysvipc_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = sysvipc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = sysvipc_proc_release, }; #endif /* CONFIG_PROC_FS */
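/*
 * A stand-alone sketch (not kernel code) of the mode arithmetic inside
 * ipcperms() above: the requested S_...UGO-style bits are folded into the
 * low octal digit and compared with the granted digit picked by the
 * owner/group match.  The CAP_IPC_OWNER override and the LSM hook that the
 * real function also applies are left out here.
 */
#include <stdio.h>

static int mode_would_deny(unsigned short mode, int flag,
			   int is_owner, int is_group_member)
{
	int requested_mode = (flag >> 6) | (flag >> 3) | flag;
	int granted_mode = mode;

	if (is_owner)
		granted_mode >>= 6;
	else if (is_group_member)
		granted_mode >>= 3;

	/* is there some bit set in requested_mode but not in granted_mode? */
	return (requested_mode & ~granted_mode & 0007) != 0;
}

int main(void)
{
	/* mode 0640: owner rw-, group r--, other --- */
	printf("owner write (0222): %s\n",
	       mode_would_deny(0640, 0222, 1, 0) ? "denied" : "granted");
	printf("group read  (0444): %s\n",
	       mode_would_deny(0640, 0444, 0, 1) ? "denied" : "granted");
	printf("other read  (0444): %s\n",
	       mode_would_deny(0640, 0444, 0, 0) ? "denied" : "granted");
	return 0;
}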
11 11 4 4 4 4 4 4 4 4 4 4 4 36 36 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2006, Johannes Berg <johannes@sipsolutions.net> */ /* just for IFNAMSIZ */ #include <linux/if.h> #include <linux/slab.h> #include <linux/export.h> #include "led.h" void ieee80211_led_assoc(struct ieee80211_local *local, bool associated) { if (!atomic_read(&local->assoc_led_active)) return; if (associated) led_trigger_event(&local->assoc_led, LED_FULL); else led_trigger_event(&local->assoc_led, LED_OFF); } void ieee80211_led_radio(struct ieee80211_local *local, bool enabled) { if (!atomic_read(&local->radio_led_active)) return; if (enabled) led_trigger_event(&local->radio_led, LED_FULL); else led_trigger_event(&local->radio_led, LED_OFF); } void ieee80211_alloc_led_names(struct ieee80211_local *local) { local->rx_led.name = kasprintf(GFP_KERNEL, "%srx", wiphy_name(local->hw.wiphy)); local->tx_led.name = kasprintf(GFP_KERNEL, "%stx", wiphy_name(local->hw.wiphy)); local->assoc_led.name = kasprintf(GFP_KERNEL, "%sassoc", wiphy_name(local->hw.wiphy)); local->radio_led.name = kasprintf(GFP_KERNEL, "%sradio", wiphy_name(local->hw.wiphy)); } void ieee80211_free_led_names(struct ieee80211_local *local) { kfree(local->rx_led.name); kfree(local->tx_led.name); kfree(local->assoc_led.name); kfree(local->radio_led.name); } static int ieee80211_tx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_inc(&local->tx_led_active); return 0; } static void ieee80211_tx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tx_led); atomic_dec(&local->tx_led_active); } static int ieee80211_rx_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); atomic_inc(&local->rx_led_active); return 0; } static void ieee80211_rx_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, rx_led); 
atomic_dec(&local->rx_led_active); } static int ieee80211_assoc_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_inc(&local->assoc_led_active); return 0; } static void ieee80211_assoc_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, assoc_led); atomic_dec(&local->assoc_led_active); } static int ieee80211_radio_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_inc(&local->radio_led_active); return 0; } static void ieee80211_radio_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, radio_led); atomic_dec(&local->radio_led_active); } static int ieee80211_tpt_led_activate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_inc(&local->tpt_led_active); return 0; } static void ieee80211_tpt_led_deactivate(struct led_classdev *led_cdev) { struct ieee80211_local *local = container_of(led_cdev->trigger, struct ieee80211_local, tpt_led); atomic_dec(&local->tpt_led_active); } void ieee80211_led_init(struct ieee80211_local *local) { atomic_set(&local->rx_led_active, 0); local->rx_led.activate = ieee80211_rx_led_activate; local->rx_led.deactivate = ieee80211_rx_led_deactivate; if (local->rx_led.name && led_trigger_register(&local->rx_led)) { kfree(local->rx_led.name); local->rx_led.name = NULL; } atomic_set(&local->tx_led_active, 0); local->tx_led.activate = ieee80211_tx_led_activate; local->tx_led.deactivate = ieee80211_tx_led_deactivate; if (local->tx_led.name && led_trigger_register(&local->tx_led)) { kfree(local->tx_led.name); local->tx_led.name = NULL; } atomic_set(&local->assoc_led_active, 0); local->assoc_led.activate = ieee80211_assoc_led_activate; local->assoc_led.deactivate = ieee80211_assoc_led_deactivate; if (local->assoc_led.name && led_trigger_register(&local->assoc_led)) { kfree(local->assoc_led.name); local->assoc_led.name = NULL; } atomic_set(&local->radio_led_active, 0); local->radio_led.activate = ieee80211_radio_led_activate; local->radio_led.deactivate = ieee80211_radio_led_deactivate; if (local->radio_led.name && led_trigger_register(&local->radio_led)) { kfree(local->radio_led.name); local->radio_led.name = NULL; } atomic_set(&local->tpt_led_active, 0); if (local->tpt_led_trigger) { local->tpt_led.activate = ieee80211_tpt_led_activate; local->tpt_led.deactivate = ieee80211_tpt_led_deactivate; if (led_trigger_register(&local->tpt_led)) { kfree(local->tpt_led_trigger); local->tpt_led_trigger = NULL; } } } void ieee80211_led_exit(struct ieee80211_local *local) { if (local->radio_led.name) led_trigger_unregister(&local->radio_led); if (local->assoc_led.name) led_trigger_unregister(&local->assoc_led); if (local->tx_led.name) led_trigger_unregister(&local->tx_led); if (local->rx_led.name) led_trigger_unregister(&local->rx_led); if (local->tpt_led_trigger) { led_trigger_unregister(&local->tpt_led); kfree(local->tpt_led_trigger); } } const char *__ieee80211_get_radio_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->radio_led.name; } EXPORT_SYMBOL(__ieee80211_get_radio_led_name); const char *__ieee80211_get_assoc_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = 
hw_to_local(hw); return local->assoc_led.name; } EXPORT_SYMBOL(__ieee80211_get_assoc_led_name); const char *__ieee80211_get_tx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->tx_led.name; } EXPORT_SYMBOL(__ieee80211_get_tx_led_name); const char *__ieee80211_get_rx_led_name(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); return local->rx_led.name; } EXPORT_SYMBOL(__ieee80211_get_rx_led_name); static unsigned long tpt_trig_traffic(struct ieee80211_local *local, struct tpt_led_trigger *tpt_trig) { unsigned long traffic, delta; traffic = tpt_trig->tx_bytes + tpt_trig->rx_bytes; delta = traffic - tpt_trig->prev_traffic; tpt_trig->prev_traffic = traffic; return DIV_ROUND_UP(delta, 1024 / 8); } static void tpt_trig_timer(struct timer_list *t) { struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); struct ieee80211_local *local = tpt_trig->local; unsigned long on, off, tpt; int i; if (!tpt_trig->running) return; mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); tpt = tpt_trig_traffic(local, tpt_trig); /* default to just solid on */ on = 1; off = 0; for (i = tpt_trig->blink_table_len - 1; i >= 0; i--) { if (tpt_trig->blink_table[i].throughput < 0 || tpt > tpt_trig->blink_table[i].throughput) { off = tpt_trig->blink_table[i].blink_time / 2; on = tpt_trig->blink_table[i].blink_time - off; break; } } led_trigger_blink(&local->tpt_led, on, off); } const char * __ieee80211_create_tpt_led_trigger(struct ieee80211_hw *hw, unsigned int flags, const struct ieee80211_tpt_blink *blink_table, unsigned int blink_table_len) { struct ieee80211_local *local = hw_to_local(hw); struct tpt_led_trigger *tpt_trig; if (WARN_ON(local->tpt_led_trigger)) return NULL; tpt_trig = kzalloc(sizeof(struct tpt_led_trigger), GFP_KERNEL); if (!tpt_trig) return NULL; snprintf(tpt_trig->name, sizeof(tpt_trig->name), "%stpt", wiphy_name(local->hw.wiphy)); local->tpt_led.name = tpt_trig->name; tpt_trig->blink_table = blink_table; tpt_trig->blink_table_len = blink_table_len; tpt_trig->want = flags; tpt_trig->local = local; timer_setup(&tpt_trig->timer, tpt_trig_timer, 0); local->tpt_led_trigger = tpt_trig; return tpt_trig->name; } EXPORT_SYMBOL(__ieee80211_create_tpt_led_trigger); static void ieee80211_start_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (tpt_trig->running) return; /* reset traffic */ tpt_trig_traffic(local, tpt_trig); tpt_trig->running = true; tpt_trig_timer(&tpt_trig->timer); mod_timer(&tpt_trig->timer, round_jiffies(jiffies + HZ)); } static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; if (!tpt_trig->running) return; tpt_trig->running = false; del_timer_sync(&tpt_trig->timer); led_trigger_event(&local->tpt_led, LED_OFF); } void ieee80211_mod_tpt_led_trig(struct ieee80211_local *local, unsigned int types_on, unsigned int types_off) { struct tpt_led_trigger *tpt_trig = local->tpt_led_trigger; bool allowed; WARN_ON(types_on & types_off); if (!tpt_trig) return; tpt_trig->active &= ~types_off; tpt_trig->active |= types_on; /* * Regardless of wanted state, we shouldn't blink when * the radio is disabled -- this can happen due to some * code ordering issues with __ieee80211_recalc_idle() * being called before the radio is started. 
 */
	allowed = tpt_trig->active & IEEE80211_TPT_LEDTRIG_FL_RADIO;

	if (!allowed || !(tpt_trig->active & tpt_trig->want))
		ieee80211_stop_tpt_led_trig(local);
	else
		ieee80211_start_tpt_led_trig(local);
}
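/*
 * A self-contained sketch of the blink selection done in tpt_trig_timer()
 * above: the byte delta of the last interval is converted to kbit
 * (DIV_ROUND_UP(delta, 1024 / 8)) and the blink table is scanned from the
 * highest threshold down.  The table values below are made up; real tables
 * come from the driver via __ieee80211_create_tpt_led_trigger().
 */
#include <stdio.h>

struct blink_entry {
	int throughput;		 /* kbit threshold, < 0 means "always matches" */
	unsigned int blink_time; /* total on+off time in ms */
};

static const struct blink_entry table[] = {
	{ -1,     500 },	/* fallback: slow blink */
	{ 5000,   250 },
	{ 50000,  100 },
	{ 200000,  50 },
};

static void pick_blink(unsigned long delta_bytes)
{
	unsigned long tpt = (delta_bytes + (1024 / 8) - 1) / (1024 / 8);
	unsigned long on = 1, off = 0;	/* default: solid on */
	int i;

	for (i = (int)(sizeof(table) / sizeof(table[0])) - 1; i >= 0; i--) {
		if (table[i].throughput < 0 ||
		    tpt > (unsigned long)table[i].throughput) {
			off = table[i].blink_time / 2;
			on = table[i].blink_time - off;
			break;
		}
	}
	printf("%lu bytes -> %lu kbit: on=%lums off=%lums\n",
	       delta_bytes, tpt, on, off);
}

int main(void)
{
	pick_blink(0);
	pick_blink(2 * 1024 * 1024);	 /* 16384 kbit in the interval */
	pick_blink(100UL * 1024 * 1024);
	return 0;
}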
989 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_CMND_H #define _SCSI_SCSI_CMND_H #include <linux/dma-mapping.h> #include <linux/blkdev.h> #include <linux/t10-pi.h> #include <linux/list.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/scatterlist.h> #include <scsi/scsi_device.h> struct Scsi_Host; /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. * fixed-length means: commands that their size can be determined * by their opcode and the CDB does not carry a length specifier, (unlike * the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly * true and the SCSI standard also defines extended commands and * vendor specific commands that can be bigger than 16 bytes. The kernel * will support these using the same infrastructure used for VARLEN CDB's. * So in effect MAX_COMMAND_SIZE means the maximum size command scsi-ml * supports without specifying a cmd_len by ULD's */ #define MAX_COMMAND_SIZE 16 struct scsi_data_buffer { struct sg_table table; unsigned length; }; /* embedded in scsi_cmnd */ struct scsi_pointer { char *ptr; /* data pointer */ int this_residual; /* left in this buffer */ struct scatterlist *buffer; /* which buffer */ int buffers_residual; /* how many buffers left */ dma_addr_t dma_handle; volatile int Status; volatile int Message; volatile int have_data_in; volatile int sent_command; volatile int phase; }; /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_INITIALIZED (1 << 1) #define SCMD_LAST (1 << 2) /* * libata uses SCSI EH to fetch sense data for successful commands. * SCSI EH should not overwrite scmd->result when SCMD_FORCE_EH_SUCCESS is set. 
*/ #define SCMD_FORCE_EH_SUCCESS (1 << 3) #define SCMD_FAIL_IF_RECOVERING (1 << 4) /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED | SCMD_FAIL_IF_RECOVERING) /* for scmd->state */ #define SCMD_STATE_COMPLETE 0 #define SCMD_STATE_INFLIGHT 1 enum scsi_cmnd_submitter { SUBMITTED_BY_BLOCK_LAYER = 0, SUBMITTED_BY_SCSI_ERROR_HANDLER = 1, SUBMITTED_BY_SCSI_RESET_IOCTL = 2, } __packed; struct scsi_cmnd { struct scsi_device *device; struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */ struct delayed_work abort_work; struct rcu_head rcu; int eh_eflags; /* Used by error handlr */ int budget_token; /* * This is set to jiffies as it was when the command was first * allocated. It is used to time how long the command has * been outstanding */ unsigned long jiffies_at_alloc; int retries; int allowed; unsigned char prot_op; unsigned char prot_type; unsigned char prot_flags; enum scsi_cmnd_submitter submitter; unsigned short cmd_len; enum dma_data_direction sc_data_direction; unsigned char cmnd[32]; /* SCSI CDB */ /* These elements define the operation we ultimately want to perform */ struct scsi_data_buffer sdb; struct scsi_data_buffer *prot_sdb; unsigned underflow; /* Return error if less than this amount is transferred */ unsigned transfersize; /* How much we are guaranteed to transfer with each SCSI transfer (ie, between disconnect / reconnects. Probably == sector size */ unsigned resid_len; /* residual count */ unsigned sense_len; unsigned char *sense_buffer; /* obtained by REQUEST SENSE when * CHECK CONDITION is received on original * command (auto-sense). Length must be * SCSI_SENSE_BUFFERSIZE bytes. */ int flags; /* Command flags */ unsigned long state; /* Command completion state */ unsigned int extra_len; /* length of alignment and padding */ /* * The fields below can be modified by the LLD but the fields above * must not be modified. */ unsigned char *host_scribble; /* The host adapter is allowed to * call scsi_malloc and get some memory * and hang it here. The host adapter * is also expected to call scsi_free * to release this memory. (The memory * obtained by scsi_malloc is guaranteed * to be at an address < 16Mb). */ int result; /* Status code from lower level driver */ }; /* Variant of blk_mq_rq_from_pdu() that verifies the type of its argument. */ static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd) { return blk_mq_rq_from_pdu(scmd); } /* * Return the driver private allocation behind the command. * Only works if cmd_size is set in the host template. 
*/ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) { return cmd + 1; } void scsi_done(struct scsi_cmnd *cmd); void scsi_done_direct(struct scsi_cmnd *cmd); extern void scsi_finish_command(struct scsi_cmnd *cmd); extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd); void scsi_free_sgtables(struct scsi_cmnd *cmd); #ifdef CONFIG_SCSI_DMA extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); #else /* !CONFIG_SCSI_DMA */ static inline int scsi_dma_map(struct scsi_cmnd *cmd) { return -ENOSYS; } static inline void scsi_dma_unmap(struct scsi_cmnd *cmd) { } #endif /* !CONFIG_SCSI_DMA */ static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) { return cmd->sdb.table.nents; } static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) { return cmd->sdb.table.sgl; } static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) { return cmd->sdb.length; } static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid) { cmd->resid_len = resid; } static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd) { return cmd->resid_len; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd, const void *buf, int buflen) { return sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd, void *buf, int buflen) { return sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline sector_t scsi_get_sector(struct scsi_cmnd *scmd) { return blk_rq_pos(scsi_cmd_to_rq(scmd)); } static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; return blk_rq_pos(scsi_cmd_to_rq(scmd)) >> shift; } static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift; } /* * The operations below are hints that tell the controller driver how * to handle I/Os with DIF or similar types of protection information. */ enum scsi_prot_operations { /* Normal I/O */ SCSI_PROT_NORMAL = 0, /* OS-HBA: Protected, HBA-Target: Unprotected */ SCSI_PROT_READ_INSERT, SCSI_PROT_WRITE_STRIP, /* OS-HBA: Unprotected, HBA-Target: Protected */ SCSI_PROT_READ_STRIP, SCSI_PROT_WRITE_INSERT, /* OS-HBA: Protected, HBA-Target: Protected */ SCSI_PROT_READ_PASS, SCSI_PROT_WRITE_PASS, }; static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op) { scmd->prot_op = op; } static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd) { return scmd->prot_op; } enum scsi_prot_flags { SCSI_PROT_TRANSFER_PI = 1 << 0, SCSI_PROT_GUARD_CHECK = 1 << 1, SCSI_PROT_REF_CHECK = 1 << 2, SCSI_PROT_REF_INCREMENT = 1 << 3, SCSI_PROT_IP_CHECKSUM = 1 << 4, }; /* * The controller usually does not know anything about the target it * is communicating with. However, when DIX is enabled the controller * must be know target type so it can verify the protection * information passed along with the I/O. 
*/ enum scsi_prot_target_type { SCSI_PROT_DIF_TYPE0 = 0, SCSI_PROT_DIF_TYPE1, SCSI_PROT_DIF_TYPE2, SCSI_PROT_DIF_TYPE3, }; static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type) { scmd->prot_type = type; } static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd) { return scmd->prot_type; } static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) { struct request *rq = blk_mq_rq_from_pdu(scmd); return t10_pi_ref_tag(rq); } static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) { return scmd->device->sector_size; } static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; } static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.sgl : NULL; } static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd) { return cmd->prot_sdb; } #define scsi_for_each_prot_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i) static inline void set_status_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xffffff00) | status; } static inline u8 get_status_byte(struct scsi_cmnd *cmd) { return cmd->result & 0xff; } static inline void set_host_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xff00ffff) | (status << 16); } static inline u8 get_host_byte(struct scsi_cmnd *cmd) { return (cmd->result >> 16) & 0xff; } /** * scsi_msg_to_host_byte() - translate message byte * * Translate the SCSI parallel message byte to a matching * host byte setting. A message of COMMAND_COMPLETE indicates * a successful command execution, any other message indicate * an error. As the messages themselves only have a meaning * for the SCSI parallel protocol this function translates * them into a matching host byte value for SCSI EH. */ static inline void scsi_msg_to_host_byte(struct scsi_cmnd *cmd, u8 msg) { switch (msg) { case COMMAND_COMPLETE: break; case ABORT_TASK_SET: set_host_byte(cmd, DID_ABORT); break; case TARGET_RESET: set_host_byte(cmd, DID_RESET); break; default: set_host_byte(cmd, DID_ERROR); break; } } static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) { unsigned int xfer_len = scmd->sdb.length; unsigned int prot_interval = scsi_prot_interval(scmd); if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI) xfer_len += (xfer_len >> ilog2(prot_interval)) * 8; return xfer_len; } extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq); struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); #endif /* _SCSI_SCSI_CMND_H */
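/*
 * A small stand-alone illustration of the arithmetic in
 * scsi_transfer_length() above: when protection information travels with
 * the data (SCSI_PROT_TRANSFER_PI), every logical block carries an extra
 * 8 bytes of PI, so the wire length grows by (len / sector_size) * 8.
 * The sector sizes below are just example values.
 */
#include <stdio.h>

static unsigned int ilog2_u32(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static unsigned int transfer_length(unsigned int xfer_len,
				    unsigned int sector_size,
				    int transfer_pi)
{
	if (transfer_pi)
		xfer_len += (xfer_len >> ilog2_u32(sector_size)) * 8;
	return xfer_len;
}

int main(void)
{
	/* 64 KiB of data on 512-byte sectors: 128 blocks, +1024 bytes of PI */
	printf("%u\n", transfer_length(64 * 1024, 512, 1));	/* 66560 */
	/* same transfer on 4096-byte sectors: 16 blocks, +128 bytes of PI */
	printf("%u\n", transfer_length(64 * 1024, 4096, 1));	/* 65664 */
	printf("%u\n", transfer_length(64 * 1024, 512, 0));	/* 65536 */
	return 0;
}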
// SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_mq.c Classful multiqueue dummy scheduler * * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> struct mq_sched { struct Qdisc **qdiscs; }; static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd) { struct net_device *dev = qdisc_dev(sch); struct tc_mq_qopt_offload opt = { .command = cmd, .handle = sch->handle, }; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt); } static int mq_offload_stats(struct Qdisc *sch) { struct tc_mq_qopt_offload opt = { .command = TC_MQ_STATS, .handle = sch->handle, .stats = { .bstats = &sch->bstats, .qstats = &sch->qstats, }, }; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt); } static void mq_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); unsigned int ntx; mq_offload(sch, TC_MQ_DESTROY); if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } static int mq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); struct netdev_queue *dev_queue; struct Qdisc *qdisc; unsigned int ntx; if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; /* pre-allocate qdiscs, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); if (!priv->qdiscs) return -ENOMEM; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1)), extack); if (!qdisc) return -ENOMEM; priv->qdiscs[ntx] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; mq_offload(sch, TC_MQ_CREATE); return 0; } static void mq_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); struct Qdisc *qdisc, *old; unsigned int ntx; for (ntx
= 0; ntx < dev->num_tx_queues; ntx++) { qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) qdisc_put(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); #endif } kfree(priv->qdiscs); priv->qdiscs = NULL; } static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; sch->q.qlen = 0; gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. However, statistics accounting needs * to account for all, none, or a mix of locked and unlocked child * qdiscs. Percpu stats are added to counters in-band and locking * qdisc totals are added at end. */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); spin_lock_bh(qdisc_lock(qdisc)); gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, &qdisc->bstats, false); gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, &qdisc->qstats); sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } return mq_offload_stats(sch); } static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static struct netdev_queue *mq_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); struct tc_mq_qopt_offload graft_offload; struct net_device *dev = qdisc_dev(sch); if (dev->flags & IFF_UP) dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); graft_offload.handle = sch->handle; graft_offload.graft_params.queue = cl - 1; graft_offload.graft_params.child_handle = new ? 
new->handle : 0; graft_offload.command = TC_MQ_GRAFT; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, TC_SETUP_QDISC_MQ, &graft_offload, extack); return 0; } static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); return rtnl_dereference(dev_queue->qdisc_sleeping); } static unsigned long mq_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!mq_queue_get(sch, ntx)) return 0; return ntx; } static int mq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; return 0; } static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = rtnl_dereference(dev_queue->qdisc_sleeping); if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; } static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static const struct Qdisc_class_ops mq_class_ops = { .select_queue = mq_select_queue, .graft = mq_graft, .leaf = mq_leaf, .find = mq_find, .walk = mq_walk, .dump = mq_dump_class, .dump_stats = mq_dump_class_stats, }; struct Qdisc_ops mq_qdisc_ops __read_mostly = { .cl_ops = &mq_class_ops, .id = "mq", .priv_size = sizeof(struct mq_sched), .init = mq_init, .destroy = mq_destroy, .attach = mq_attach, .change_real_num_tx = mq_change_real_num_tx, .dump = mq_dump, .owner = THIS_MODULE, };
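/*
 * Illustrative sketch, not part of sch_mq.c above: the rough shape of the
 * driver callback that receives the tc_mq_qopt_offload requests issued by
 * mq_offload(), mq_offload_stats() and mq_graft().  my_setup_tc() is a
 * hypothetical ndo_setup_tc implementation; the command values and offload
 * fields are the ones populated by the qdisc code above.
 */
#include <linux/netdevice.h>
#include <net/pkt_cls.h>

static int my_setup_tc(struct net_device *dev, enum tc_setup_type type,
		       void *type_data)
{
	struct tc_mq_qopt_offload *opt = type_data;

	if (type != TC_SETUP_QDISC_MQ)
		return -EOPNOTSUPP;

	switch (opt->command) {
	case TC_MQ_CREATE:
		/* remember opt->handle as the offloaded root qdisc */
		return 0;
	case TC_MQ_DESTROY:
		/* drop any state kept for opt->handle */
		return 0;
	case TC_MQ_STATS:
		/* fold hardware counters into the buffers supplied by the
		 * stack via opt->stats.bstats and opt->stats.qstats
		 */
		return 0;
	case TC_MQ_GRAFT:
		/* opt->graft_params.queue is the TX queue index,
		 * opt->graft_params.child_handle the new child (0 = none)
		 */
		return -EOPNOTSUPP;
	default:
		return -EOPNOTSUPP;
	}
}

/* A real driver would wire this up as .ndo_setup_tc in its net_device_ops. */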
// SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * AF_SMC protocol family socket handler keeping the AF_INET sock address type * applies to SOCK_STREAM sockets only * offers an alternative
communication option for TCP-protocol sockets * applicable with RoCE-cards only * * Initial restrictions: * - support for alternate links postponed * * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> * based on prototype from Frank Blaschka */ #define KMSG_COMPONENT "smc" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/socket.h> #include <linux/workqueue.h> #include <linux/in.h> #include <linux/sched/signal.h> #include <linux/if_vlan.h> #include <linux/rcupdate_wait.h> #include <linux/ctype.h> #include <linux/splice.h> #include <net/sock.h> #include <net/tcp.h> #include <net/smc.h> #include <asm/ioctls.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include "smc_netns.h" #include "smc.h" #include "smc_clc.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server */ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); void *hdr; if (cb_ctx->pos[0]) goto out; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_DUMP_HS_LIMITATION); if (!hdr) return -ENOMEM; if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, sock_net(skb->sk)->smc.limit_smc_hs)) goto err; genlmsg_end(skb, hdr); cb_ctx->pos[0] = 1; out: return skb->len; err: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) { sock_net(skb->sk)->smc.limit_smc_hs = true; return 0; } int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) { sock_net(skb->sk)->smc.limit_smc_hs = false; return 0; } static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct smc_sock *smc; struct sock *child; smc = smc_clcsock_user_data(sk); if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > sk->sk_max_ack_backlog) goto drop; if (sk_acceptq_is_full(&smc->sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } /* passthrough to original syn recv sock fct */ child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); /* child must not inherit smc or its ops */ if (child) { rcu_assign_sk_user_data(child, NULL); /* v4-mapped sockets don't inherit parent ops. Don't restore. 
*/ if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops) inet_csk(child)->icsk_af_ops = smc->ori_af_ops; } return child; drop: dst_release(dst); tcp_listendrop(sk); return NULL; } static bool smc_hs_congested(const struct sock *sk) { const struct smc_sock *smc; smc = smc_clcsock_user_data(sk); if (!smc) return true; if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) return true; return false; } static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; static struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; struct hlist_head *head; head = &h->ht; write_lock_bh(&h->lock); sk_add_node(sk, head); write_unlock_bh(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } EXPORT_SYMBOL_GPL(smc_hash_sk); void smc_unhash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; write_lock_bh(&h->lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } EXPORT_SYMBOL_GPL(smc_unhash_sk); /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the * BH context */ static void smc_release_cb(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); if (smc->conn.tx_in_release_sock) { smc_tx_pending(&smc->conn); smc->conn.tx_in_release_sock = false; } } struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); struct proto smc_proto6 = { .name = "SMC6", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto6); static void smc_fback_restore_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = NULL; smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); write_unlock_bh(&clcsk->sk_callback_lock); } static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; smc_fback_restore_callbacks(smc); } } static int __smc_release(struct smc_sock *smc) { struct sock *sk = &smc->sk; int rc = 0; if (!smc->use_fallback) { rc = smc_close_active(smc); smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { if (sk->sk_state != SMC_CLOSED) { if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) sock_put(sk); /* passive closing */ if (sk->sk_state == SMC_LISTEN) { /* wake up clcsock accept */ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); } smc_restore_fallback_changes(smc); } sk->sk_prot->unhash(sk); if (sk->sk_state == SMC_CLOSED) { if (smc->clcsock) { release_sock(sk); 
smc_clcsock_release(smc); lock_sock(sk); } if (!smc->use_fallback) smc_conn_free(&smc->conn); } return rc; } static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; int old_state, rc = 0; if (!sk) goto out; sock_hold(sk); /* sock_put below */ smc = smc_sk(sk); old_state = sk->sk_state; /* cleanup for a dangling non-blocking connect */ if (smc->connect_nonblock && old_state == SMC_INIT) tcp_abort(smc->clcsock->sk, ECONNABORTED); if (cancel_work_sync(&smc->connect_work)) sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */ if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again */ lock_sock_nested(sk, SINGLE_DEPTH_NESTING); else lock_sock(sk); if (old_state == SMC_INIT && sk->sk_state == SMC_ACTIVE && !smc->use_fallback) smc_close_active_abort(smc); rc = __smc_release(smc); /* detach socket */ sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ out: return rc; } static void smc_destruct(struct sock *sk) { if (sk->sk_state != SMC_CLOSED) return; if (!sock_flag(sk, SOCK_DEAD)) return; } static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, int protocol) { struct smc_sock *smc; struct proto *prot; struct sock *sk; prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); if (!sk) return NULL; sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); return sk; } static int smc_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); /* replicate tests from inet_bind(), to be safe wrt. 
future changes */ rc = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) goto out; rc = -EAFNOSUPPORT; if (addr->sin_family != AF_INET && addr->sin_family != AF_INET6 && addr->sin_family != AF_UNSPEC) goto out; /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ if (addr->sin_family == AF_UNSPEC && addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; lock_sock(sk); /* Check if socket is already active */ rc = -EINVAL; if (sk->sk_state != SMC_INIT || smc->connect_nonblock) goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; rc = kernel_bind(smc->clcsock, uaddr, addr_len); out_rel: release_sock(sk); out: return rc; } /* copy only relevant settings and flags of SOL_SOCKET level from smc to * clc socket (since smc is not called for these options from net/core) */ #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ (1UL << SOCK_LINGER) | \ (1UL << SOCK_BROADCAST) | \ (1UL << SOCK_TIMESTAMP) | \ (1UL << SOCK_DBG) | \ (1UL << SOCK_RCVTSTAMP) | \ (1UL << SOCK_RCVTSTAMPNS) | \ (1UL << SOCK_LOCALROUTE) | \ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ (1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_WIFI_STATUS) | \ (1UL << SOCK_NOFCS) | \ (1UL << SOCK_FILTER_LOCKED) | \ (1UL << SOCK_TSTAMP_NEW)) /* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, unsigned long mask) { struct net *nnet = sock_net(nsk); nsk->sk_userlocks = osk->sk_userlocks; if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { nsk->sk_sndbuf = osk->sk_sndbuf; } else { if (mask == SK_FLAGS_SMC_TO_CLC) WRITE_ONCE(nsk->sk_sndbuf, READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); else WRITE_ONCE(nsk->sk_sndbuf, 2 * READ_ONCE(nnet->smc.sysctl_wmem)); } if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { nsk->sk_rcvbuf = osk->sk_rcvbuf; } else { if (mask == SK_FLAGS_SMC_TO_CLC) WRITE_ONCE(nsk->sk_rcvbuf, READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); else WRITE_ONCE(nsk->sk_rcvbuf, 2 * READ_ONCE(nnet->smc.sysctl_rmem)); } } static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, unsigned long mask) { /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; nsk->sk_sndtimeo = osk->sk_sndtimeo; nsk->sk_rcvtimeo = osk->sk_rcvtimeo; nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_priority = READ_ONCE(osk->sk_priority); nsk->sk_rcvlowat = osk->sk_rcvlowat; nsk->sk_bound_dev_if = osk->sk_bound_dev_if; nsk->sk_err = osk->sk_err; nsk->sk_flags &= ~mask; nsk->sk_flags |= osk->sk_flags & mask; smc_adjust_sock_bufsizes(nsk, osk, mask); } static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); } #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ (1UL << SOCK_LINGER) | \ (1UL << SOCK_DBG)) /* copy only settings and flags relevant for smc from clc to smc socket */ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) { smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } /* register the new vzalloced sndbuf on all links */ static int smcr_lgr_reg_sndbufs(struct smc_link *link, struct smc_buf_desc *snd_desc) { struct smc_link_group *lgr = link->lgr; int i, rc = 0; if (!snd_desc->is_vm) return -EINVAL; /* protect against parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; rc = 
smcr_link_reg_buf(&lgr->lnk[i], snd_desc); if (rc) break; } up_write(&lgr->llc_conf_mutex); return rc; } /* register the new rmb on all links */ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; bool do_slow = false; int i, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; if (!rmb_desc->is_reg_mr[link->link_idx]) { up_read(&lgr->llc_conf_mutex); goto slow_path; } } /* mr register already */ goto fast_path; slow_path: do_slow = true; /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; rc = smcr_link_reg_buf(&lgr->lnk[i], rmb_desc); if (rc) goto out; } fast_path: /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { rc = -EFAULT; goto out; } rmb_desc->is_conf_rkey = true; out: do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; struct smc_llc_qentry *qentry; int rc; /* Receive CONFIRM LINK request from server over RoCE fabric. * Increasing the client's timeout by twice as much as the server's * timeout by default can temporarily avoid decline messages of * both sides crossing or colliding */ qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? 
SMC_CLC_DECL_TIMEOUT_CL : rc; } smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); if (rc) return SMC_CLC_DECL_ERR_RDYLNK; smc_wr_remember_qp_attr(link); /* reg the sndbuf if it was vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } /* reg the rmb */ if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); if (link->lgr->max_links > 1) { /* optional 2nd link, receive ADD LINK request from server */ qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); if (rc == -EAGAIN) rc = 0; /* no DECLINE received, go with one link */ return rc; } smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); smc_llc_cli_add_link(link, qentry); } return 0; } static bool smc_isascii(char *hostname) { int i; for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) if (!isascii(hostname[i])) return false; return true; } static void smc_conn_save_peer_info_fce(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { struct smc_clc_first_contact_ext *fce; int clc_v2_len; if (clc->hdr.version == SMC_V1 || !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) return; if (smc->conn.lgr->is_smcd) { memcpy(smc->conn.lgr->negotiated_eid, clc->d1.eid, SMC_MAX_EID_LEN); clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1); } else { memcpy(smc->conn.lgr->negotiated_eid, clc->r1.eid, SMC_MAX_EID_LEN); clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1); } fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); smc->conn.lgr->peer_os = fce->os_type; smc->conn.lgr->peer_smc_release = fce->release; if (smc_isascii(fce->hostname)) memcpy(smc->conn.lgr->peer_hostname, fce->hostname, SMC_MAX_HOSTNAME_LEN); } static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size); smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx; smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token); smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; smc->conn.peer_token = ntohll(clc->d0.token); /* msg header takes up space in the buffer */ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; } static void smc_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { if (smc->conn.lgr->is_smcd) smcd_conn_save_peer_info(smc, clc); else smcr_conn_save_peer_info(smc, clc); smc_conn_save_peer_info_fce(smc, clc); } static void smc_link_save_peer_info(struct smc_link *link, 
struct smc_clc_msg_accept_confirm *clc, struct smc_init_info *ini) { link->peer_qpn = ntoh24(clc->r0.qpn); memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, struct smc_stats_fback *fback_arr) { int cnt; for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) { if (fback_arr[cnt].fback_code == smc->fallback_rsn) { fback_arr[cnt].count++; break; } if (!fback_arr[cnt].fback_code) { fback_arr[cnt].fback_code = smc->fallback_rsn; fback_arr[cnt].count++; break; } } } static void smc_stat_fallback(struct smc_sock *smc) { struct net *net = sock_net(&smc->sk); mutex_lock(&net->smc.mutex_fback_rsn); if (smc->listen_smc) { smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv); net->smc.fback_rsn->srv_fback_cnt++; } else { smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt); net->smc.fback_rsn->clnt_fback_cnt++; } mutex_unlock(&net->smc.mutex_fback_rsn); } /* must be called under rcu read lock */ static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) { struct socket_wq *wq; __poll_t flags; wq = rcu_dereference(smc->sk.sk_wq); if (!skwq_has_sleeper(wq)) return; /* wake up smc sk->sk_wq */ if (!key) { /* sk_state_change */ wake_up_interruptible_all(&wq->wait); } else { flags = key_to_poll(key); if (flags & (EPOLLIN | EPOLLOUT)) /* sk_data_ready or sk_write_space */ wake_up_interruptible_sync_poll(&wq->wait, flags); else if (flags & EPOLLERR) /* sk_error_report */ wake_up_interruptible_poll(&wq->wait, flags); } } static int smc_fback_mark_woken(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct smc_mark_woken *mark = container_of(wait, struct smc_mark_woken, wait_entry); mark->woken = true; mark->key = key; return 0; } static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, void (*clcsock_callback)(struct sock *sk)) { struct smc_mark_woken mark = { .woken = false }; struct socket_wq *wq; init_waitqueue_func_entry(&mark.wait_entry, smc_fback_mark_woken); rcu_read_lock(); wq = rcu_dereference(clcsk->sk_wq); if (!wq) goto out; add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); clcsock_callback(clcsk); remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); if (mark.woken) smc_fback_wakeup_waitqueue(smc, mark.key); out: rcu_read_unlock(); } static void smc_fback_state_change(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_data_ready(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_write_space(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_error_report(struct sock *clcsk) { struct smc_sock *smc; read_lock_bh(&clcsk->sk_callback_lock); smc = smc_clcsock_user_data(clcsk); if (smc) smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report); read_unlock_bh(&clcsk->sk_callback_lock); } static void smc_fback_replace_callbacks(struct 
smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; write_lock_bh(&clcsk->sk_callback_lock); clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, &smc->clcsk_state_change); smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, &smc->clcsk_data_ready); smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, &smc->clcsk_write_space); smc_clcsock_replace_cb(&clcsk->sk_error_report, smc_fback_error_report, &smc->clcsk_error_report); write_unlock_bh(&clcsk->sk_callback_lock); } static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) { int rc = 0; mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { rc = -EBADF; goto out; } smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); trace_smc_switch_to_fallback(smc, reason_code); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; smc->sk.sk_socket->wq.fasync_list = NULL; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up * as clcsock's wait queue is woken up. */ smc_fback_replace_callbacks(smc); } out: mutex_unlock(&smc->clcsock_release_lock); return rc; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { struct net *net = sock_net(&smc->sk); int rc = 0; rc = smc_switch_to_fallback(smc, reason_code); if (rc) { /* fallback fails */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; } /* decline and fall back during connect */ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, u8 version) { struct net *net = sock_net(&smc->sk); int rc; if (reason_code < 0) { /* error, fallback is not possible */ this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return reason_code; } if (reason_code != SMC_CLC_DECL_PEERDECL) { rc = smc_clc_send_decline(smc, reason_code, version); if (rc < 0) { this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); if (smc->sk.sk_state == SMC_INIT) sock_put(&smc->sk); /* passive closing */ return rc; } } return smc_connect_fallback(smc, reason_code); } static void smc_conn_abort(struct smc_sock *smc, int local_first) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; bool lgr_valid = false; if (smc_conn_lgr_valid(conn)) lgr_valid = true; smc_conn_free(conn); if (local_first && lgr_valid) smc_lgr_cleanup_early(lgr); } /* check if there is a rdma device available for this connection. */ /* called for connect and listen */ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) { /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ smc_pnet_find_roce_resource(smc->clcsock->sk, ini); if (!ini->check_smcrv2 && !ini->ib_dev) return SMC_CLC_DECL_NOSMCRDEV; if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) return SMC_CLC_DECL_NOSMCRDEV; return 0; } /* check if there is an ISM device available for this connection. 
*/ /* called for connect and listen */ static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini) { /* Find ISM device with same PNETID as connecting interface */ smc_pnet_find_ism_resource(smc->clcsock->sk, ini); if (!ini->ism_dev[0]) return SMC_CLC_DECL_NOSMCDDEV; else ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]); return 0; } /* is chid unique for the ism devices that are already determined? */ static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini, int cnt) { int i = (!ini->ism_dev[0]) ? 1 : 0; for (; i < cnt; i++) if (ini->ism_chid[i] == chid) return false; return true; } /* determine possible V2 ISM devices (either without PNETID or with PNETID plus * PNETID matching net_device) */ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, struct smc_init_info *ini) { int rc = SMC_CLC_DECL_NOSMCDDEV; struct smcd_dev *smcd; int i = 1, entry = 1; bool is_virtual; u16 chid; if (smcd_indicated(ini->smc_type_v1)) rc = 0; /* already initialized for V1 */ mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->going_away || smcd == ini->ism_dev[0]) continue; chid = smc_ism_get_chid(smcd); if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) continue; is_virtual = __smc_ism_is_virtual(chid); if (!smc_pnet_is_pnetid_set(smcd->pnetid) || smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { if (is_virtual && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) /* It's the last GID-CHID entry left in CLC * Proposal SMC-Dv2 extension, but a virtual * ISM device will take two entries. So give * up it and try the next potential ISM device. */ continue; ini->ism_dev[i] = smcd; ini->ism_chid[i] = chid; ini->is_smcd = true; rc = 0; i++; entry = is_virtual ? entry + 2 : entry + 1; if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES) break; } } mutex_unlock(&smcd_dev_list.mutex); ini->ism_offered_cnt = i - 1; if (!ini->ism_dev[0] && !ini->ism_dev[1]) ini->smcd_version = 0; return rc; } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ static int smc_connect_ism_vlan_setup(struct smc_sock *smc, struct smc_init_info *ini) { if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; return 0; } static int smc_find_proposal_devices(struct smc_sock *smc, struct smc_init_info *ini) { int rc = 0; /* check if there is an ism device available */ if (!(ini->smcd_version & SMC_V1) || smc_find_ism_device(smc, ini) || smc_connect_ism_vlan_setup(smc, ini)) ini->smcd_version &= ~SMC_V1; /* else ISM V1 is supported for this connection */ /* check if there is an rdma device available */ if (!(ini->smcr_version & SMC_V1) || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V1; /* else RDMA is supported for this connection */ ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, ini->smcr_version & SMC_V1); /* check if there is an ism v2 device available */ if (!(ini->smcd_version & SMC_V2) || !smc_ism_is_v2_capable() || smc_find_ism_v2_device_clnt(smc, ini)) ini->smcd_version &= ~SMC_V2; /* check if there is an rdma v2 device available */ ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || smc->clcsock->sk->sk_family != AF_INET || !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; ini->check_smcrv2 = false; ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, ini->smcr_version & SMC_V2); /* if neither ISM nor RDMA are supported, fallback */ if 
(ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) rc = SMC_CLC_DECL_NOSMCDEV; return rc; } /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. */ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, struct smc_init_info *ini) { if (!smcd_indicated(ini->smc_type_v1)) return 0; if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_CNFERR; return 0; } #define SMC_CLC_MAX_ACCEPT_LEN \ (sizeof(struct smc_clc_msg_accept_confirm) + \ sizeof(struct smc_clc_first_contact_ext_v2x) + \ sizeof(struct smc_clc_msg_trail)) /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int rc = 0; /* do inband token exchange */ rc = smc_clc_send_proposal(smc, ini); if (rc) return rc; /* receive SMC Accept CLC message */ return smc_clc_wait_msg(smc, aclc, SMC_CLC_MAX_ACCEPT_LEN, SMC_CLC_ACCEPT, CLC_WAIT_TIME); } void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid) { struct smc_init_info *alt_ini = NULL; memset(gidlist, 0, sizeof(*gidlist)); memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); if (!alt_ini) goto out; alt_ini->vlan_id = lgr->vlan_id; alt_ini->check_smcrv2 = true; alt_ini->smcrv2.saddr = lgr->saddr; smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); if (!alt_ini->smcrv2.ib_dev_v2) goto out; memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); out: kfree(alt_ini); } static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(aclc, false); struct net *net = sock_net(&smc->sk); int rc; if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) return 0; if (fce->v2_direct) { memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); ini->smcrv2.uses_gateway = false; } else { if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr, smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), ini->smcrv2.nexthop_mac, &ini->smcrv2.uses_gateway)) return SMC_CLC_DECL_NOROUTE; if (!ini->smcrv2.uses_gateway) { /* mismatch: peer claims indirect, but its direct */ return SMC_CLC_DECL_NOINDIRECT; } } ini->release_nr = fce->release; rc = smc_clc_clnt_v2x_features_validate(fce, ini); if (rc) return rc; return 0; } /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i, reason_code = 0; struct smc_link *link; u8 *eid = NULL; ini->is_smcd = false; ini->ib_clcqpn = ntoh24(aclc->r0.qpn); ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); ini->max_conns = SMC_CONN_PER_LGR_MAX; ini->max_links = SMC_LINKS_ADD_LNK_MAX; reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); if (reason_code) return reason_code; mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); if (reason_code) { mutex_unlock(&smc_client_lgr_pending); return reason_code; } smc_conn_save_peer_info(smc, aclc); if (ini->first_contact_local) { link = smc->conn.lnk; } else { /* set link that was 
assigned by server */ link = NULL; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *l = &smc->conn.lgr->lnk[i]; if (l->peer_qpn == ntoh24(aclc->r0.qpn) && !memcmp(l->peer_gid, &aclc->r0.lcl.gid, SMC_GID_SIZE) && (aclc->hdr.version > SMC_V1 || !memcmp(l->peer_mac, &aclc->r0.lcl.mac, sizeof(l->peer_mac)))) { link = l; break; } } if (!link) { reason_code = SMC_CLC_DECL_NOSRVLINK; goto connect_abort; } smc_switch_link_and_count(&smc->conn, link); } /* create send buffer and rmb */ if (smc_buf_create(smc, false)) { reason_code = SMC_CLC_DECL_MEM; goto connect_abort; } if (ini->first_contact_local) smc_link_save_peer_info(link, aclc, ini); if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto connect_abort; } smc_close_init(smc); smc_rx_init(smc); if (ini->first_contact_local) { if (smc_ib_ready_link(link)) { reason_code = SMC_CLC_DECL_ERR_RDYLNK; goto connect_abort; } } else { /* reg sendbufs if they were vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGBUF; goto connect_abort; } } if (aclc->hdr.version > SMC_V1) { eid = aclc->r1.eid; if (ini->first_contact_local) smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, link->smcibdev, link->gid); } reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, aclc->hdr.version, eid, ini); if (reason_code) goto connect_abort; smc_tx_init(smc); if (ini->first_contact_local) { /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_clnt_conf_first_link(smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto connect_abort; } mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; return reason_code; } /* The server has chosen one of the proposed ISM devices for the communication. * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. */ static int smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } } return -EPROTO; } /* setup for ISM connection of client */ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { u8 *eid = NULL; int rc = 0; ini->is_smcd = true; ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; if (aclc->hdr.version == SMC_V2) { if (ini->first_contact_peer) { struct smc_clc_first_contact_ext *fce = smc_get_clc_first_contact_ext(aclc, true); ini->release_nr = fce->release; rc = smc_clc_clnt_v2x_features_validate(fce, ini); if (rc) return rc; } rc = smc_v2_determine_accepted_chid(aclc, ini); if (rc) return rc; if (__smc_ism_is_virtual(ini->ism_chid[ini->ism_selected])) ini->ism_peer_gid[ini->ism_selected].gid_ext = ntohll(aclc->d1.gid_ext); /* for non-virtual ISM devices, peer gid_ext remains 0. 
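		 * Virtual ISM devices additionally carry a GID extension,
		 * taken from gid_ext in the received CLC accept message.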
*/ } ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid); /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); rc = smc_conn_create(smc, ini); if (rc) { mutex_unlock(&smc_server_lgr_pending); return rc; } /* Create send and receive buffers */ rc = smc_buf_create(smc, true); if (rc) { rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; goto connect_abort; } smc_conn_save_peer_info(smc, aclc); smc_close_init(smc); smc_rx_init(smc); smc_tx_init(smc); if (aclc->hdr.version > SMC_V1) eid = aclc->d1.eid; rc = smc_clc_send_confirm(smc, ini->first_contact_local, aclc->hdr.version, eid, ini); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); mutex_unlock(&smc_server_lgr_pending); smc->connect_nonblock = 0; return rc; } /* check if received accept type and version matches a proposed one */ static int smc_connect_check_aclc(struct smc_init_info *ini, struct smc_clc_msg_accept_confirm *aclc) { if (aclc->hdr.typev1 != SMC_TYPE_R && aclc->hdr.typev1 != SMC_TYPE_D) return SMC_CLC_DECL_MODEUNSUPP; if (aclc->hdr.version >= SMC_V2) { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v2)) || (aclc->hdr.typev1 == SMC_TYPE_D && !smcd_indicated(ini->smc_type_v2))) return SMC_CLC_DECL_MODEUNSUPP; } else { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v1)) || (aclc->hdr.typev1 == SMC_TYPE_D && !smcd_indicated(ini->smc_type_v1))) return SMC_CLC_DECL_MODEUNSUPP; } return 0; } /* perform steps before actually connecting */ static int __smc_connect(struct smc_sock *smc) { u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; struct smc_clc_msg_accept_confirm *aclc; struct smc_init_info *ini = NULL; u8 *buf = NULL; int rc = 0; if (smc->use_fallback) return smc_connect_fallback(smc, smc->fallback_rsn); /* if peer has not signalled SMC-capability, fall back */ if (!tcp_sk(smc->clcsock->sk)->syn_smc) return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC); /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(smc)) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC, version); ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, version); ini->smcd_version = SMC_V1 | SMC_V2; ini->smcr_version = SMC_V1 | SMC_V2; ini->smc_type_v1 = SMC_TYPE_B; ini->smc_type_v2 = SMC_TYPE_B; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { ini->smcd_version &= ~SMC_V1; ini->smcr_version = 0; ini->smc_type_v1 = SMC_TYPE_N; if (!ini->smcd_version) { rc = SMC_CLC_DECL_GETVLANERR; goto fallback; } } rc = smc_find_proposal_devices(smc, ini); if (rc) goto fallback; buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL); if (!buf) { rc = SMC_CLC_DECL_MEM; goto fallback; } aclc = (struct smc_clc_msg_accept_confirm *)buf; /* perform CLC handshake */ rc = smc_connect_clc(smc, aclc, ini); if (rc) { /* -EAGAIN on timeout, see tcp_recvmsg() */ if (rc == -EAGAIN) { rc = -ETIMEDOUT; smc->sk.sk_err = ETIMEDOUT; } goto vlan_cleanup; } /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); version = aclc->hdr.version == SMC_V1 ? 
SMC_V1 : SMC_V2; if (rc) goto vlan_cleanup; /* depending on previous steps, connect using rdma or ism */ if (aclc->hdr.typev1 == SMC_TYPE_R) { ini->smcr_version = version; rc = smc_connect_rdma(smc, aclc, ini); } else if (aclc->hdr.typev1 == SMC_TYPE_D) { ini->smcd_version = version; rc = smc_connect_ism(smc, aclc, ini); } if (rc) goto vlan_cleanup; SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); kfree(ini); return 0; vlan_cleanup: smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); fallback: kfree(ini); return smc_connect_decline_fallback(smc, rc, version); } static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); long timeo = smc->sk.sk_sndtimeo; int rc = 0; if (!timeo) timeo = MAX_SCHEDULE_TIMEOUT; lock_sock(smc->clcsock->sk); if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; } else if ((1 << smc->clcsock->sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); if ((rc == -EPIPE) && ((1 << smc->clcsock->sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))) rc = 0; } release_sock(smc->clcsock->sk); lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; else if (rc == -ECONNREFUSED) smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ goto out; } rc = __smc_connect(smc); if (rc < 0) smc->sk.sk_err = -rc; out: if (!sock_flag(&smc->sk, SOCK_DEAD)) { if (smc->sk.sk_err) { smc->sk.sk_state_change(&smc->sk); } else { /* allow polling before and after fallback decision */ smc->clcsock->sk->sk_write_space(smc->clcsock->sk); smc->sk.sk_write_space(&smc->sk); } } release_sock(&smc->sk); } static int smc_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -EINVAL; smc = smc_sk(sk); /* separate smc parameter checking to be safe */ if (alen < sizeof(addr->sa_family)) goto out_err; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) goto out_err; lock_sock(sk); switch (sock->state) { default: rc = -EINVAL; goto out; case SS_CONNECTED: rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; goto out; case SS_CONNECTING: if (sk->sk_state == SMC_ACTIVE) goto connected; break; case SS_UNCONNECTED: sock->state = SS_CONNECTING; break; } switch (sk->sk_state) { default: goto out; case SMC_CLOSED: rc = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; case SMC_INIT: break; } smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; if (smc->connect_nonblock) { rc = -EALREADY; goto out; } rc = kernel_connect(smc->clcsock, addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; if (smc->use_fallback) { sock->state = rc ? 
SS_CONNECTING : SS_CONNECTED; goto out; } sock_hold(&smc->sk); /* sock put in passive closing */ if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; } connected: rc = 0; sock->state = SS_CONNECTED; out: release_sock(sk); out_err: return rc; } static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) { struct socket *new_clcsock = NULL; struct sock *lsk = &lsmc->sk; struct sock *new_sk; int rc = -EINVAL; release_sock(lsk); new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; *new_smc = NULL; lock_sock(lsk); goto out; } *new_smc = smc_sk(new_sk); mutex_lock(&lsmc->clcsock_release_lock); if (lsmc->clcsock) rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK); mutex_unlock(&lsmc->clcsock_release_lock); lock_sock(lsk); if (rc < 0 && rc != -EAGAIN) lsk->sk_err = -rc; if (rc < 0 || lsk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; smc_sock_set_flag(new_sk, SOCK_DEAD); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; } /* new clcsock has inherited the smc listen-specific sk_data_ready * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; /* if new clcsock has also inherited the fallback-specific callback * functions, switch them back to the original ones. */ if (lsmc->use_fallback) { if (lsmc->clcsk_state_change) new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; if (lsmc->clcsk_write_space) new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; if (lsmc->clcsk_error_report) new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; } (*new_smc)->clcsock = new_clcsock; out: return rc; } /* add a just created sock to the accept queue of the listen sock as * candidate for a following socket accept call from user space */ static void smc_accept_enqueue(struct sock *parent, struct sock *sk) { struct smc_sock *par = smc_sk(parent); sock_hold(sk); /* sock_put in smc_accept_unlink () */ spin_lock(&par->accept_q_lock); list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_added(parent); } /* remove a socket from the accept queue of its parental listening socket */ static void smc_accept_unlink(struct sock *sk) { struct smc_sock *par = smc_sk(sk)->listen_smc; spin_lock(&par->accept_q_lock); list_del_init(&smc_sk(sk)->accept_q); spin_unlock(&par->accept_q_lock); sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); sock_put(sk); /* sock_hold in smc_accept_enqueue */ } /* remove a sock from the accept queue to bind it to a new socket created * for a socket accept call from user space */ struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock) { struct smc_sock *isk, *n; struct sock *new_sk; list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) { new_sk = (struct sock *)isk; smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { new_sk->sk_prot->unhash(new_sk); if (isk->clcsock) { sock_release(isk->clcsock); isk->clcsock = NULL; } sock_put(new_sk); /* final */ continue; } if (new_sock) { sock_graft(new_sk, new_sock); new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; } } return new_sk; } return NULL; } /* clean up for 
a created but never accepted sock */ void smc_close_non_accepted(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); sock_hold(sk); /* sock_put below */ lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT); __smc_release(smc); release_sock(sk); sock_put(sk); /* sock_hold above */ sock_put(sk); /* final sock_put */ } static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; struct smc_llc_qentry *qentry; int rc; /* reg the sndbuf if it was vzalloced*/ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_link_reg_buf(link, smc->conn.sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } /* reg the rmb */ if (smcr_link_reg_buf(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; /* send CONFIRM LINK request to client over the RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } smc_llc_save_peer_uid(qentry); rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); if (rc) return SMC_CLC_DECL_RMBE_EC; /* confirm_rkey is implicit on 1st contact */ smc->conn.rmb_desc->is_conf_rkey = true; smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); if (link->lgr->max_links > 1) { down_write(&link->lgr->llc_conf_mutex); /* initial contact - try to establish second link */ smc_llc_srv_add_link(link, NULL); up_write(&link->lgr->llc_conf_mutex); } return 0; } /* listen worker: finish */ static void smc_listen_out(struct smc_sock *new_smc) { struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); release_sock(&lsmc->sk); } else { /* no longer listening */ smc_close_non_accepted(newsmcsk); } /* Wake up accept */ lsmc->sk.sk_data_ready(&lsmc->sk); sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ } /* listen worker: finish in state connected */ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; smc_listen_out(new_smc); } /* listen worker: finish in error state */ static void smc_listen_out_err(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; struct net *net = sock_net(newsmcsk); this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt); if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; smc_listen_out(new_smc); } /* listen worker: decline and fall back if possible */ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, int local_first, u8 version) { /* RDMA setup failed, switch back to TCP */ smc_conn_abort(new_smc, local_first); if (reason_code < 0 || smc_switch_to_fallback(new_smc, reason_code)) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { 
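			/* Sending the decline message itself failed, so a clean
			 * fallback to TCP cannot be negotiated with the peer;
			 * finish the new socket in the error state instead.
			 */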
smc_listen_out_err(new_smc); return; } } smc_listen_out_connected(new_smc); } /* listen worker: version checking */ static int smc_listen_v2_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext; struct smc_clc_v2_extension *pclc_v2_ext; int rc = SMC_CLC_DECL_PEERNOSMC; ini->smc_type_v1 = pclc->hdr.typev1; ini->smc_type_v2 = pclc->hdr.typev2; ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0; ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0; if (pclc->hdr.version > SMC_V1) { if (smcd_indicated(ini->smc_type_v2)) ini->smcd_version |= SMC_V2; if (smcr_indicated(ini->smc_type_v2)) ini->smcr_version |= SMC_V2; } if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { rc = SMC_CLC_DECL_PEERNOSMC; goto out; } pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) { ini->smcd_version &= ~SMC_V2; ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2EXT; goto out; } pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); if (ini->smcd_version & SMC_V2) { if (!smc_ism_is_v2_capable()) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOISM2SUPP; } else if (!pclc_smcd_v2_ext) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2DEXT; } else if (!pclc_v2_ext->hdr.eid_cnt && !pclc_v2_ext->hdr.flag.seid) { ini->smcd_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOUEID; } } if (ini->smcr_version & SMC_V2) { if (!pclc_v2_ext->hdr.eid_cnt) { ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOUEID; } } ini->release_nr = pclc_v2_ext->hdr.flag.release; if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE) ini->release_nr = SMC_RELEASE; out: if (!ini->smcd_version && !ini->smcr_version) return rc; return 0; } /* listen worker: check prefixes */ static int smc_listen_prfx_check(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc) { struct smc_clc_msg_proposal_prefix *pclc_prfx; struct socket *newclcsock = new_smc->clcsock; if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; return 0; } /* listen worker: initialize connection and buffers */ static int smc_listen_rdma_init(struct smc_sock *new_smc, struct smc_init_info *ini) { int rc; /* allocate connection / link group */ rc = smc_conn_create(new_smc, ini); if (rc) return rc; /* create send buffer and rmb */ if (smc_buf_create(new_smc, false)) { smc_conn_abort(new_smc, ini->first_contact_local); return SMC_CLC_DECL_MEM; } return 0; } /* listen worker: initialize connection and buffers for SMC-D */ static int smc_listen_ism_init(struct smc_sock *new_smc, struct smc_init_info *ini) { int rc; rc = smc_conn_create(new_smc, ini); if (rc) return rc; /* Create send and receive buffers */ rc = smc_buf_create(new_smc, true); if (rc) { smc_conn_abort(new_smc, ini->first_contact_local); return (rc == -ENOSPC) ? 
SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; } return 0; } static bool smc_is_already_selected(struct smcd_dev *smcd, struct smc_init_info *ini, int matches) { int i; for (i = 0; i < matches; i++) if (smcd == ini->ism_dev[i]) return true; return false; } /* check for ISM devices matching proposed ISM devices */ static void smc_check_ism_v2_match(struct smc_init_info *ini, u16 proposed_chid, struct smcd_gid *proposed_gid, unsigned int *matches) { struct smcd_dev *smcd; list_for_each_entry(smcd, &smcd_dev_list.list, list) { if (smcd->going_away) continue; if (smc_is_already_selected(smcd, ini, *matches)) continue; if (smc_ism_get_chid(smcd) == proposed_chid && !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { ini->ism_peer_gid[*matches].gid = proposed_gid->gid; if (__smc_ism_is_virtual(proposed_chid)) ini->ism_peer_gid[*matches].gid_ext = proposed_gid->gid_ext; /* non-virtual ISM's peer gid_ext remains 0. */ ini->ism_dev[*matches] = smcd; (*matches)++; break; } } } static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) { if (!ini->rc) ini->rc = rc; } static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_smcd_v2_extension *smcd_v2_ext; struct smc_clc_v2_extension *smc_v2_ext; struct smc_clc_msg_smcd *pclc_smcd; unsigned int matches = 0; struct smcd_gid smcd_gid; u8 smcd_version; u8 *eid = NULL; int i, rc; u16 chid; if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) goto not_found; pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { /* check for ISM device matching proposed native ISM device */ smcd_gid.gid = ntohll(pclc_smcd->ism.gid); smcd_gid.gid_ext = 0; smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), &smcd_gid, &matches); } for (i = 0; i < smc_v2_ext->hdr.ism_gid_cnt; i++) { /* check for ISM devices matching proposed non-native ISM * devices */ smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid); smcd_gid.gid_ext = 0; chid = ntohs(smcd_v2_ext->gidchid[i].chid); if (__smc_ism_is_virtual(chid)) { if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt || chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid)) /* each virtual ISM device takes two GID-CHID * entries and CHID of the second entry repeats * that of the first entry. * * So check if the next GID-CHID entry exists * and both two entries' CHIDs are the same. 
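			 * If that is not the case, the proposed entry cannot
			 * describe a valid virtual ISM device and is skipped.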
*/ continue; smcd_gid.gid_ext = ntohll(smcd_v2_ext->gidchid[++i].gid); } smc_check_ism_v2_match(ini, chid, &smcd_gid, &matches); } mutex_unlock(&smcd_dev_list.mutex); if (!ini->ism_dev[0]) { smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; } smc_ism_get_system_eid(&eid); if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, smcd_v2_ext->system_eid, eid)) goto not_found; /* separate - outside the smcd_dev_list.lock */ smcd_version = ini->smcd_version; for (i = 0; i < matches; i++) { ini->smcd_version = SMC_V2; ini->is_smcd = true; ini->ism_selected = i; rc = smc_listen_ism_init(new_smc, ini); if (rc) { smc_find_ism_store_rc(rc, ini); /* try next active ISM device */ continue; } return; /* matching and usable V2 ISM device found */ } /* no V2 ISM device could be initialized */ ini->smcd_version = smcd_version; /* restore original value */ ini->negotiated_eid[0] = 0; not_found: ini->smcd_version &= ~SMC_V2; ini->ism_dev[0] = NULL; ini->is_smcd = false; } static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc); int rc = 0; /* check if ISM V1 is available */ if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); ini->ism_peer_gid[0].gid_ext = 0; rc = smc_find_ism_device(new_smc, ini); if (rc) goto not_found; ini->ism_selected = 0; rc = smc_listen_ism_init(new_smc, ini); if (!rc) return; /* V1 ISM device found */ not_found: smc_find_ism_store_rc(rc, ini); ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; } /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) { struct smc_connection *conn = &new_smc->conn; if (!local_first) { /* reg sendbufs if they were vzalloced */ if (conn->sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(conn->lnk, conn->sndbuf_desc)) return SMC_CLC_DECL_ERR_REGBUF; } if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGBUF; } return 0; } static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { struct smc_clc_v2_extension *smc_v2_ext; u8 smcr_version; int rc; if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); ini->check_smcrv2 = true; ini->smcrv2.clc_sk = new_smc->clcsock->sk; ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); if (rc) { smc_find_ism_store_rc(rc, ini); goto not_found; } if (!ini->smcrv2.uses_gateway) memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); if (!rc) { rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); if (rc) smc_conn_abort(new_smc, ini->first_contact_local); } if (!rc) return; ini->smcr_version = smcr_version; smc_find_ism_store_rc(rc, ini); not_found: ini->smcr_version &= ~SMC_V2; ini->smcrv2.ib_dev_v2 = NULL; 
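	/* no usable SMC-R v2 device was found or initialized: clear the
	 * remaining v2 state so the caller can still try SMC-R v1 or
	 * decline the connection
	 */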
ini->check_smcrv2 = false; } static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int rc; if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) return SMC_CLC_DECL_NOSMCDEV; /* prepare RDMA check */ memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); rc = smc_find_rdma_device(new_smc, ini); if (rc) { /* no RDMA device found */ return SMC_CLC_DECL_NOSMCDEV; } rc = smc_listen_rdma_init(new_smc, ini); if (rc) return rc; return smc_listen_rdma_reg(new_smc, ini->first_contact_local); } /* determine the local device matching to proposal */ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int prfx_rc; /* check for ISM device matching V2 proposed device */ smc_find_ism_v2_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; /* check for matching IP prefix and subnet length (V1) */ prfx_rc = smc_listen_prfx_check(new_smc, pclc); if (prfx_rc) smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) return ini->rc ?: SMC_CLC_DECL_GETVLANERR; /* check for ISM device matching V1 proposed device */ if (!prfx_rc) smc_find_ism_v1_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; if (!smcr_indicated(pclc->hdr.typev1) && !smcr_indicated(pclc->hdr.typev2)) /* skip RDMA and decline */ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; /* check if RDMA V2 is available */ smc_find_rdma_v2_device_serv(new_smc, pclc, ini); if (ini->smcrv2.ib_dev_v2) return 0; /* check if RDMA V1 is available */ if (!prfx_rc) { int rc; rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); smc_find_ism_store_rc(rc, ini); return (!rc) ? 
0 : ini->rc; } return prfx_rc; } /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, bool local_first, struct smc_init_info *ini) { struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_first) smc_link_save_peer_info(link, cclc, ini); if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) return SMC_CLC_DECL_ERR_RTOK; if (local_first) { if (smc_ib_ready_link(link)) return SMC_CLC_DECL_ERR_RDYLNK; /* QP confirmation over RoCE fabric */ smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); reason_code = smcr_serv_conf_first_link(new_smc); smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); } return reason_code; } /* setup for connection of server */ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm *cclc; struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; struct smc_init_info *ini = NULL; u8 proposal_version = SMC_V1; u8 accept_version; int rc = 0; if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) return smc_listen_out_err(new_smc); if (new_smc->use_fallback) { smc_listen_out_connected(new_smc); return; } /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); if (rc) smc_listen_out_err(new_smc); else smc_listen_out_connected(new_smc); return; } /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) { rc = SMC_CLC_DECL_MEM; goto out_decl; } pclc = (struct smc_clc_msg_proposal *)buf; rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf), SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (rc) goto out_decl; if (pclc->hdr.version > SMC_V1) proposal_version = SMC_V2; /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(new_smc)) { rc = SMC_CLC_DECL_IPSEC; goto out_decl; } ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) { rc = SMC_CLC_DECL_MEM; goto out_decl; } /* initial version checking */ rc = smc_listen_v2_check(new_smc, pclc, ini); if (rc) goto out_decl; rc = smc_clc_srv_v2x_features_validate(new_smc, pclc, ini); if (rc) goto out_decl; mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); /* determine ISM or RoCE device used for connection */ rc = smc_listen_find_device(new_smc, pclc, ini); if (rc) goto out_unlock; /* send SMC Accept CLC message */ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, accept_version, ini->negotiated_eid, ini); if (rc) goto out_unlock; /* SMC-D does not need this lock any more */ if (ini->is_smcd) mutex_unlock(&smc_server_lgr_pending); /* receive SMC Confirm CLC message */ memset(buf, 0, sizeof(*buf)); cclc = (struct smc_clc_msg_accept_confirm *)buf; rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf), SMC_CLC_CONFIRM, CLC_WAIT_TIME); if (rc) { if (!ini->is_smcd) goto out_unlock; goto out_decl; } rc = smc_clc_v2x_features_confirm_check(cclc, ini); if (rc) { if (!ini->is_smcd) goto out_unlock; goto out_decl; } /* fce smc release version is needed in smc_listen_rdma_finish, * so save fce info here. 
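	 * The full smc_conn_save_peer_info() below runs only after
	 * smc_listen_rdma_finish() has completed, which would be too late.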
*/ smc_conn_save_peer_info_fce(new_smc, cclc); /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, ini->first_contact_local, ini); if (rc) goto out_unlock; mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, proposal_version); out_free: kfree(ini); kfree(buf); } static void smc_tcp_listen_work(struct work_struct *work) { struct smc_sock *lsmc = container_of(work, struct smc_sock, tcp_listen_work); struct sock *lsk = &lsmc->sk; struct smc_sock *new_smc; int rc = 0; lock_sock(lsk); while (lsk->sk_state == SMC_LISTEN) { rc = smc_clcsock_accept(lsmc, &new_smc); if (rc) /* clcsock accept queue empty or error */ goto out; if (!new_smc) continue; if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_inc(&lsmc->queued_smc_hs); new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); } out: release_sock(lsk); sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } static void smc_clcsock_data_ready(struct sock *listen_clcsock) { struct smc_sock *lsmc; read_lock_bh(&listen_clcsock->sk_callback_lock); lsmc = smc_clcsock_user_data(listen_clcsock); if (!lsmc) goto out; lsmc->clcsk_data_ready(listen_clcsock); if (lsmc->sk.sk_state == SMC_LISTEN) { sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) sock_put(&lsmc->sk); } out: read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); lock_sock(sk); rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; if (sk->sk_state == SMC_LISTEN) { sk->sk_max_ack_backlog = backlog; goto out; } /* some socket options are handled in core, so we could not apply * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); if (!smc->use_fallback) tcp_sk(smc->clcsock->sk)->syn_smc = 1; /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, smc_clcsock_data_ready, &smc->clcsk_data_ready); write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); /* save original ops */ smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; smc->af_ops = *smc->ori_af_ops; smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; if (smc->limit_smc_hs) tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; 
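		/* listen on the internal CLC/TCP socket failed; the original
		 * sk_data_ready callback was restored above, so drop the
		 * callback lock and return the error
		 */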
write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; out: release_sock(sk); return rc; } static int smc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); struct smc_sock *lsmc; long timeo; int rc = 0; lsmc = smc_sk(sk); sock_hold(sk); /* sock_put below */ lock_sock(sk); if (lsmc->sk.sk_state != SMC_LISTEN) { rc = -EINVAL; release_sock(sk); goto out; } /* Wait for an incoming connection */ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { rc = -EAGAIN; break; } release_sock(sk); timeo = schedule_timeout(timeo); /* wakeup by sk_data_ready in smc_listen_work() */ sched_annotate_sleep(); lock_sock(sk); if (signal_pending(current)) { rc = sock_intr_errno(timeo); break; } } set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); if (!rc) rc = sock_error(nsk); release_sock(sk); if (rc) goto out; if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * MSEC_PER_SEC); if (smc_sk(nsk)->use_fallback) { struct sock *clcsk = smc_sk(nsk)->clcsock->sk; lock_sock(clcsk); if (skb_queue_empty(&clcsk->sk_receive_queue)) sk_wait_data(clcsk, &timeo, NULL); release_sock(clcsk); } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { lock_sock(nsk); smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); release_sock(nsk); } } out: sock_put(sk); /* sock_hold above */ return rc; } static int smc_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct smc_sock *smc; if (peer && (sock->sk->sk_state != SMC_ACTIVE) && (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) return -ENOTCONN; smc = smc_sk(sock->sk); return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc; smc = smc_sk(sk); lock_sock(sk); /* SMC does not support connect with fastopen */ if (msg->msg_flags & MSG_FASTOPEN) { /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); if (rc) goto out; } else { rc = -EINVAL; goto out; } } else if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_INIT)) { rc = -EPIPE; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); } else { rc = smc_tx_sendmsg(smc, msg, len); SMC_STAT_TX_PAYLOAD(smc, len, rc); } out: release_sock(sk); return rc; } static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -ENOTCONN; smc = smc_sk(sk); lock_sock(sk); if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { /* socket was connected before, no more data to read */ rc = 0; goto out; } if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || (sk->sk_state == SMC_CLOSED)) goto out; if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { rc = 0; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); } else { msg->msg_namelen = 0; rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); SMC_STAT_RX_PAYLOAD(smc, rc, rc); } out: 
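	/* common exit: drop the socket lock and return the number of bytes
	 * received or a negative error code
	 */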
release_sock(sk); return rc; } static __poll_t smc_accept_poll(struct sock *parent) { struct smc_sock *isk = smc_sk(parent); __poll_t mask = 0; spin_lock(&isk->accept_q_lock); if (!list_empty(&isk->accept_q)) mask = EPOLLIN | EPOLLRDNORM; spin_unlock(&isk->accept_q_lock); return mask; } static __poll_t smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct smc_sock *smc; __poll_t mask = 0; if (!sk) return EPOLLNVAL; smc = smc_sk(sock->sk); if (smc->use_fallback) { /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; } else { if (sk->sk_state != SMC_CLOSED) sock_poll_wait(file, sock, wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) mask |= EPOLLHUP; if (sk->sk_state == SMC_LISTEN) { /* woken up by sk_data_ready in smc_listen_work() */ mask |= smc_accept_poll(sk); } else if (smc->use_fallback) { /* as result of connect_work()*/ mask |= smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; } else { if ((sk->sk_state != SMC_INIT && atomic_read(&smc->conn.sndbuf_space)) || sk->sk_shutdown & SEND_SHUTDOWN) { mask |= EPOLLOUT | EPOLLWRNORM; } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; if (smc->conn.urg_state == SMC_URG_VALID) mask |= EPOLLPRI; } } return mask; } static int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; bool do_shutdown = true; struct smc_sock *smc; int rc = -EINVAL; int old_state; int rc1 = 0; smc = smc_sk(sk); if ((how < SHUT_RD) || (how > SHUT_RDWR)) return rc; lock_sock(sk); if (sock->state == SS_CONNECTING) { if (sk->sk_state == SMC_ACTIVE) sock->state = SS_CONNECTED; else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || sk->sk_state == SMC_PEERCLOSEWAIT2 || sk->sk_state == SMC_APPCLOSEWAIT1 || sk->sk_state == SMC_APPCLOSEWAIT2 || sk->sk_state == SMC_APPFINCLOSEWAIT) sock->state = SS_DISCONNECTING; } rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && (sk->sk_state != SMC_PEERCLOSEWAIT2) && (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_APPCLOSEWAIT2) && (sk->sk_state != SMC_APPFINCLOSEWAIT)) goto out; if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; sk->sk_socket->state = SS_UNCONNECTED; sock_put(sk); } goto out; } switch (how) { case SHUT_RDWR: /* shutdown in both directions */ old_state = sk->sk_state; rc = smc_close_active(smc); if (old_state == SMC_ACTIVE && sk->sk_state == SMC_PEERCLOSEWAIT1) do_shutdown = false; break; case SHUT_WR: rc = smc_close_shutdown_write(smc); break; case SHUT_RD: rc = 0; /* nothing more to do because peer is not involved */ break; } if (do_shutdown && smc->clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; if (sk->sk_state == SMC_CLOSED) sock->state = SS_UNCONNECTED; else sock->state = SS_DISCONNECTING; out: release_sock(sk); return rc ? 
rc : rc1; } static int __smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; int val, len; smc = smc_sk(sock->sk); if (get_user(len, optlen)) return -EFAULT; len = min_t(int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case SMC_LIMIT_HS: val = smc->limit_smc_hs; break; default: return -EOPNOTSUPP; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int __smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; int val, rc; smc = smc_sk(sk); lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: if (optlen < sizeof(int)) { rc = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(int))) { rc = -EFAULT; break; } smc->limit_smc_hs = !!val; rc = 0; break; default: rc = -EOPNOTSUPP; break; } release_sock(sk); return rc; } static int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; int val, rc; if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; else if (level == SOL_SMC) return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the * CLC socket */ mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } if (unlikely(!smc->clcsock->ops->setsockopt)) rc = -EOPNOTSUPP; else rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, optval, optlen); if (smc->clcsock->sk->sk_err) { sk->sk_err = smc->clcsock->sk->sk_err; sk_error_report(sk); } mutex_unlock(&smc->clcsock_release_lock); if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; lock_sock(sk); if (rc || smc->use_fallback) goto out; switch (optname) { case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } break; case TCP_NODELAY: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (val) { SMC_STAT_INC(smc, ndly_cnt); smc_tx_pending(&smc->conn); cancel_delayed_work(&smc->conn.tx_work); } } break; case TCP_CORK: if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_CLOSED) { if (!val) { SMC_STAT_INC(smc, cork_cnt); smc_tx_pending(&smc->conn); cancel_delayed_work(&smc->conn.tx_work); } } break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; default: break; } out: release_sock(sk); return rc; } static int smc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct smc_sock *smc; int rc; if (level == SOL_SMC) return __smc_getsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { mutex_unlock(&smc->clcsock_release_lock); return -EBADF; } /* socket options apply to the CLC socket */ if (unlikely(!smc->clcsock->ops->getsockopt)) { mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, optval, optlen); mutex_unlock(&smc->clcsock_release_lock); return rc; } static int smc_ioctl(struct 
socket *sock, unsigned int cmd, unsigned long arg) { union smc_host_cursor cons, urg; struct smc_connection *conn; struct smc_sock *smc; int answ; smc = smc_sk(sock->sk); conn = &smc->conn; lock_sock(&smc->sk); if (smc->use_fallback) { if (!smc->clcsock) { release_sock(&smc->sk); return -EBADF; } answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); release_sock(&smc->sk); return answ; } switch (cmd) { case SIOCINQ: /* same as FIONREAD */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = atomic_read(&smc->conn.bytes_to_rcv); break; case SIOCOUTQ: /* output queue size (not send + not acked) */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = smc->conn.sndbuf_desc->len - atomic_read(&smc->conn.sndbuf_space); break; case SIOCOUTQNSD: /* output queue size (not send only) */ if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; else answ = smc_tx_prepared_sends(&smc->conn); break; case SIOCATMARK: if (smc->sk.sk_state == SMC_LISTEN) { release_sock(&smc->sk); return -EINVAL; } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) { answ = 0; } else { smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_curs_copy(&urg, &conn->urg_curs, conn); answ = smc_curs_diff(conn->rmb_desc->len, &cons, &urg) == 1; } break; default: release_sock(&smc->sk); return -ENOIOCTLCMD; } release_sock(&smc->sk); return put_user(answ, (int __user *)arg); } /* Map the affected portions of the rmbe into an spd, note the number of bytes * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor * updates till whenever a respective page has been fully processed. * Note that subsequent recv() calls have to wait till all splice() processing * completed. 
*/ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; int rc = -ENOTCONN; smc = smc_sk(sk); lock_sock(sk); if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) { /* socket was connected before, no more data to read */ rc = 0; goto out; } if (sk->sk_state == SMC_INIT || sk->sk_state == SMC_LISTEN || sk->sk_state == SMC_CLOSED) goto out; if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { rc = 0; goto out; } if (smc->use_fallback) { rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, pipe, len, flags); } else { if (*ppos) { rc = -ESPIPE; goto out; } if (flags & SPLICE_F_NONBLOCK) flags = MSG_DONTWAIT; else flags = 0; SMC_STAT_INC(smc, splice_cnt); rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: release_sock(sk); return rc; } /* must look like tcp */ static const struct proto_ops smc_sock_ops = { .family = PF_SMC, .owner = THIS_MODULE, .release = smc_release, .bind = smc_bind, .connect = smc_connect, .socketpair = sock_no_socketpair, .accept = smc_accept, .getname = smc_getname, .poll = smc_poll, .ioctl = smc_ioctl, .listen = smc_listen, .shutdown = smc_shutdown, .setsockopt = smc_setsockopt, .getsockopt = smc_getsockopt, .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, .splice_read = smc_splice_read, }; static int __smc_create(struct net *net, struct socket *sock, int protocol, int kern, struct socket *clcsock) { int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; struct smc_sock *smc; struct sock *sk; int rc; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_STREAM) goto out; rc = -EPROTONOSUPPORT; if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) goto out; rc = -ENOBUFS; sock->ops = &smc_sock_ops; sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; /* default behavior from limit_smc_hs in every net namespace */ smc->limit_smc_hs = net->smc.limit_smc_hs; rc = 0; if (!clcsock) { rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); if (rc) { sk_common_release(sk); goto out; } /* smc_clcsock_release() does not wait smc->clcsock->sk's * destruction; its sk_state might not be TCP_CLOSE after * smc->sk is close()d, and TCP timers can be fired later, * which need net ref. 
*/ sk = smc->clcsock->sk; __netns_tracker_free(net, &sk->ns_tracker, false); sk->sk_net_refcnt = 1; get_net_track(net, &sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); } else { smc->clcsock = clcsock; } out: return rc; } static int smc_create(struct net *net, struct socket *sock, int protocol, int kern) { return __smc_create(net, sock, protocol, kern, NULL); } static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; static int smc_ulp_init(struct sock *sk) { struct socket *tcp = sk->sk_socket; struct net *net = sock_net(sk); struct socket *smcsock; int protocol, ret; /* only TCP can be replaced */ if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) return -ESOCKTNOSUPPORT; /* don't handle wq now */ if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) return -ENOTCONN; if (sk->sk_family == AF_INET) protocol = SMCPROTO_SMC; else protocol = SMCPROTO_SMC6; smcsock = sock_alloc(); if (!smcsock) return -ENFILE; smcsock->type = SOCK_STREAM; __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ ret = __smc_create(net, smcsock, protocol, 1, tcp); if (ret) { sock_release(smcsock); /* module_put() which ops won't be NULL */ return ret; } /* replace tcp socket to smc */ smcsock->file = tcp->file; smcsock->file->private_data = smcsock; smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ tcp->file = NULL; return ret; } static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(newsk); /* don't inherit ulp ops to child when listen */ icsk->icsk_ulp_ops = NULL; } static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { .name = "smc", .owner = THIS_MODULE, .init = smc_ulp_init, .clone = smc_ulp_clone, }; unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { int rc; rc = smc_sysctl_net_init(net); if (rc) return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } static __net_init int smc_net_stat_init(struct net *net) { return smc_stats_init(net); } static void __net_exit smc_net_stat_exit(struct net *net) { smc_stats_exit(net); } static struct pernet_operations smc_net_ops = { .init = smc_net_init, .exit = smc_net_exit, .id = &smc_net_id, .size = sizeof(struct smc_net), }; static struct pernet_operations smc_net_stat_ops = { .init = smc_net_stat_init, .exit = smc_net_stat_exit, }; static int __init smc_init(void) { int rc; rc = register_pernet_subsys(&smc_net_ops); if (rc) return rc; rc = register_pernet_subsys(&smc_net_stat_ops); if (rc) goto out_pernet_subsys; rc = smc_ism_init(); if (rc) goto out_pernet_subsys_stat; smc_clc_init(); rc = smc_nl_init(); if (rc) goto out_ism; rc = smc_pnet_init(); if (rc) goto out_nl; rc = -ENOMEM; smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); if (!smc_tcp_ls_wq) goto out_pnet; smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); if (!smc_hs_wq) goto out_alloc_tcp_ls_wq; smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); if (!smc_close_wq) goto out_alloc_hs_wq; rc = smc_core_init(); if (rc) { pr_err("%s: smc_core_init fails with %d\n", __func__, rc); goto out_alloc_wqs; } rc = smc_llc_init(); if (rc) { pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); goto out_core; } rc = 
smc_cdc_init(); if (rc) { pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); goto out_core; } rc = proto_register(&smc_proto, 1); if (rc) { pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); goto out_core; } rc = proto_register(&smc_proto6, 1); if (rc) { pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); goto out_proto; } rc = sock_register(&smc_sock_family_ops); if (rc) { pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { pr_err("%s: ib_register fails with %d\n", __func__, rc); goto out_sock; } rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); goto out_ib; } static_branch_enable(&tcp_have_smc); return 0; out_ib: smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); out_core: smc_core_exit(); out_alloc_wqs: destroy_workqueue(smc_close_wq); out_alloc_hs_wq: destroy_workqueue(smc_hs_wq); out_alloc_tcp_ls_wq: destroy_workqueue(smc_tcp_ls_wq); out_pnet: smc_pnet_exit(); out_nl: smc_nl_exit(); out_ism: smc_clc_exit(); smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: unregister_pernet_subsys(&smc_net_ops); return rc; } static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); smc_nl_exit(); smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); } module_init(smc_init); module_exit(smc_exit); MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); MODULE_ALIAS_TCP_ULP("smc"); MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME);
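/*
 * Usage sketch (assumption, not part of the original sources): besides
 * creating AF_SMC sockets directly, an ordinary TCP socket can be switched
 * to SMC through the "smc" ULP registered by smc_ulp_ops above, e.g. from
 * userspace:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	setsockopt(fd, SOL_TCP, TCP_ULP, "smc", sizeof("smc"));
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *
 * The file descriptor and address variables are illustrative only; the ULP
 * must be set while the socket is still unconnected (see smc_ulp_init()).
 */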
// SPDX-License-Identifier: GPL-2.0-or-later
/* PKCS#7 parser
 *
 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKCS7: "fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/oid_registry.h> #include <crypto/public_key.h> #include "pkcs7_parser.h" #include "pkcs7.asn1.h" MODULE_DESCRIPTION("PKCS#7 parser"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); struct pkcs7_parse_context { struct pkcs7_message *msg; /* Message being constructed */ struct pkcs7_signed_info *sinfo; /* SignedInfo being constructed */ struct pkcs7_signed_info **ppsinfo; struct x509_certificate *certs; /* Certificate cache */ struct x509_certificate **ppcerts; unsigned long data; /* Start of data */ enum OID last_oid; /* Last OID encountered */ unsigned x509_index; unsigned sinfo_index; const void *raw_serial; unsigned raw_serial_size; unsigned raw_issuer_size; const void *raw_issuer; const void *raw_skid; unsigned raw_skid_size; bool expect_skid; }; /* * Free a signed information block. */ static void pkcs7_free_signed_info(struct pkcs7_signed_info *sinfo) { if (sinfo) { public_key_signature_free(sinfo->sig); kfree(sinfo); } } /** * pkcs7_free_message - Free a PKCS#7 message * @pkcs7: The PKCS#7 message to free */ void pkcs7_free_message(struct pkcs7_message *pkcs7) { struct x509_certificate *cert; struct pkcs7_signed_info *sinfo; if (pkcs7) { while (pkcs7->certs) { cert = pkcs7->certs; pkcs7->certs = cert->next; x509_free_certificate(cert); } while (pkcs7->crl) { cert = pkcs7->crl; pkcs7->crl = cert->next; x509_free_certificate(cert); } while (pkcs7->signed_infos) { sinfo = pkcs7->signed_infos; pkcs7->signed_infos = sinfo->next; pkcs7_free_signed_info(sinfo); } kfree(pkcs7); } } EXPORT_SYMBOL_GPL(pkcs7_free_message); /* * Check authenticatedAttributes are provided or not provided consistently. 
*/ static int pkcs7_check_authattrs(struct pkcs7_message *msg) { struct pkcs7_signed_info *sinfo; bool want = false; sinfo = msg->signed_infos; if (!sinfo) goto inconsistent; if (sinfo->authattrs) { want = true; msg->have_authattrs = true; } for (sinfo = sinfo->next; sinfo; sinfo = sinfo->next) if (!!sinfo->authattrs != want) goto inconsistent; return 0; inconsistent: pr_warn("Inconsistently supplied authAttrs\n"); return -EINVAL; } /** * pkcs7_parse_message - Parse a PKCS#7 message * @data: The raw binary ASN.1 encoded message to be parsed * @datalen: The size of the encoded message */ struct pkcs7_message *pkcs7_parse_message(const void *data, size_t datalen) { struct pkcs7_parse_context *ctx; struct pkcs7_message *msg = ERR_PTR(-ENOMEM); int ret; ctx = kzalloc(sizeof(struct pkcs7_parse_context), GFP_KERNEL); if (!ctx) goto out_no_ctx; ctx->msg = kzalloc(sizeof(struct pkcs7_message), GFP_KERNEL); if (!ctx->msg) goto out_no_msg; ctx->sinfo = kzalloc(sizeof(struct pkcs7_signed_info), GFP_KERNEL); if (!ctx->sinfo) goto out_no_sinfo; ctx->sinfo->sig = kzalloc(sizeof(struct public_key_signature), GFP_KERNEL); if (!ctx->sinfo->sig) goto out_no_sig; ctx->data = (unsigned long)data; ctx->ppcerts = &ctx->certs; ctx->ppsinfo = &ctx->msg->signed_infos; /* Attempt to decode the signature */ ret = asn1_ber_decoder(&pkcs7_decoder, ctx, data, datalen); if (ret < 0) { msg = ERR_PTR(ret); goto out; } ret = pkcs7_check_authattrs(ctx->msg); if (ret < 0) { msg = ERR_PTR(ret); goto out; } msg = ctx->msg; ctx->msg = NULL; out: while (ctx->certs) { struct x509_certificate *cert = ctx->certs; ctx->certs = cert->next; x509_free_certificate(cert); } out_no_sig: pkcs7_free_signed_info(ctx->sinfo); out_no_sinfo: pkcs7_free_message(ctx->msg); out_no_msg: kfree(ctx); out_no_ctx: return msg; } EXPORT_SYMBOL_GPL(pkcs7_parse_message); /** * pkcs7_get_content_data - Get access to the PKCS#7 content * @pkcs7: The preparsed PKCS#7 message to access * @_data: Place to return a pointer to the data * @_data_len: Place to return the data length * @_headerlen: Size of ASN.1 header not included in _data * * Get access to the data content of the PKCS#7 message. The size of the * header of the ASN.1 object that contains it is also provided and can be used * to adjust *_data and *_data_len to get the entire object. * * Returns -ENODATA if the data object was missing from the message. */ int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, const void **_data, size_t *_data_len, size_t *_headerlen) { if (!pkcs7->data) return -ENODATA; *_data = pkcs7->data; *_data_len = pkcs7->data_len; if (_headerlen) *_headerlen = pkcs7->data_hdrlen; return 0; } EXPORT_SYMBOL_GPL(pkcs7_get_content_data); /* * Note an OID when we find one for later processing when we know how * to interpret it. */ int pkcs7_note_OID(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->last_oid = look_up_OID(value, vlen); if (ctx->last_oid == OID__NR) { char buffer[50]; sprint_oid(value, vlen, buffer, sizeof(buffer)); printk("PKCS7: Unknown OID: [%lu] %s\n", (unsigned long)value - ctx->data, buffer); } return 0; } /* * Note the digest algorithm for the signature. 
*/ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; switch (ctx->last_oid) { case OID_sha256: ctx->sinfo->sig->hash_algo = "sha256"; break; case OID_sha384: ctx->sinfo->sig->hash_algo = "sha384"; break; case OID_sha512: ctx->sinfo->sig->hash_algo = "sha512"; break; case OID_sha224: ctx->sinfo->sig->hash_algo = "sha224"; break; case OID_sm3: ctx->sinfo->sig->hash_algo = "sm3"; break; case OID_gost2012Digest256: ctx->sinfo->sig->hash_algo = "streebog256"; break; case OID_gost2012Digest512: ctx->sinfo->sig->hash_algo = "streebog512"; break; case OID_sha3_256: ctx->sinfo->sig->hash_algo = "sha3-256"; break; case OID_sha3_384: ctx->sinfo->sig->hash_algo = "sha3-384"; break; case OID_sha3_512: ctx->sinfo->sig->hash_algo = "sha3-512"; break; default: printk("Unsupported digest algo: %u\n", ctx->last_oid); return -ENOPKG; } return 0; } /* * Note the public key algorithm for the signature. */ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; switch (ctx->last_oid) { case OID_rsaEncryption: ctx->sinfo->sig->pkey_algo = "rsa"; ctx->sinfo->sig->encoding = "pkcs1"; break; case OID_id_ecdsa_with_sha224: case OID_id_ecdsa_with_sha256: case OID_id_ecdsa_with_sha384: case OID_id_ecdsa_with_sha512: case OID_id_ecdsa_with_sha3_256: case OID_id_ecdsa_with_sha3_384: case OID_id_ecdsa_with_sha3_512: ctx->sinfo->sig->pkey_algo = "ecdsa"; ctx->sinfo->sig->encoding = "x962"; break; case OID_SM2_with_SM3: ctx->sinfo->sig->pkey_algo = "sm2"; ctx->sinfo->sig->encoding = "raw"; break; case OID_gost2012PKey256: case OID_gost2012PKey512: ctx->sinfo->sig->pkey_algo = "ecrdsa"; ctx->sinfo->sig->encoding = "raw"; break; default: printk("Unsupported pkey algo: %u\n", ctx->last_oid); return -ENOPKG; } return 0; } /* * We only support signed data [RFC2315 sec 9]. 
*/ int pkcs7_check_content_type(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; if (ctx->last_oid != OID_signed_data) { pr_warn("Only support pkcs7_signedData type\n"); return -EINVAL; } return 0; } /* * Note the SignedData version */ int pkcs7_note_signeddata_version(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; unsigned version; if (vlen != 1) goto unsupported; ctx->msg->version = version = *(const u8 *)value; switch (version) { case 1: /* PKCS#7 SignedData [RFC2315 sec 9.1] * CMS ver 1 SignedData [RFC5652 sec 5.1] */ break; case 3: /* CMS ver 3 SignedData [RFC2315 sec 5.1] */ break; default: goto unsupported; } return 0; unsupported: pr_warn("Unsupported SignedData version\n"); return -EINVAL; } /* * Note the SignerInfo version */ int pkcs7_note_signerinfo_version(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; unsigned version; if (vlen != 1) goto unsupported; version = *(const u8 *)value; switch (version) { case 1: /* PKCS#7 SignerInfo [RFC2315 sec 9.2] * CMS ver 1 SignerInfo [RFC5652 sec 5.3] */ if (ctx->msg->version != 1) goto version_mismatch; ctx->expect_skid = false; break; case 3: /* CMS ver 3 SignerInfo [RFC2315 sec 5.3] */ if (ctx->msg->version == 1) goto version_mismatch; ctx->expect_skid = true; break; default: goto unsupported; } return 0; unsupported: pr_warn("Unsupported SignerInfo version\n"); return -EINVAL; version_mismatch: pr_warn("SignedData-SignerInfo version mismatch\n"); return -EBADMSG; } /* * Extract a certificate and store it in the context. */ int pkcs7_extract_cert(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; struct x509_certificate *x509; if (tag != ((ASN1_UNIV << 6) | ASN1_CONS_BIT | ASN1_SEQ)) { pr_debug("Cert began with tag %02x at %lu\n", tag, (unsigned long)ctx - ctx->data); return -EBADMSG; } /* We have to correct for the header so that the X.509 parser can start * from the beginning. Note that since X.509 stipulates DER, there * probably shouldn't be an EOC trailer - but it is in PKCS#7 (which * stipulates BER). */ value -= hdrlen; vlen += hdrlen; if (((u8*)value)[1] == 0x80) vlen += 2; /* Indefinite length - there should be an EOC */ x509 = x509_cert_parse(value, vlen); if (IS_ERR(x509)) return PTR_ERR(x509); x509->index = ++ctx->x509_index; pr_debug("Got cert %u for %s\n", x509->index, x509->subject); pr_debug("- fingerprint %*phN\n", x509->id->len, x509->id->data); *ctx->ppcerts = x509; ctx->ppcerts = &x509->next; return 0; } /* * Save the certificate list */ int pkcs7_note_certificate_list(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; pr_devel("Got cert list (%02x)\n", tag); *ctx->ppcerts = ctx->msg->certs; ctx->msg->certs = ctx->certs; ctx->certs = NULL; ctx->ppcerts = &ctx->certs; return 0; } /* * Note the content type. */ int pkcs7_note_content(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; if (ctx->last_oid != OID_data && ctx->last_oid != OID_msIndirectData) { pr_warn("Unsupported data type %d\n", ctx->last_oid); return -EINVAL; } ctx->msg->data_type = ctx->last_oid; return 0; } /* * Extract the data from the message and store that and its content type OID in * the context. 
*/ int pkcs7_note_data(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; pr_debug("Got data\n"); ctx->msg->data = value; ctx->msg->data_len = vlen; ctx->msg->data_hdrlen = hdrlen; return 0; } /* * Parse authenticated attributes. */ int pkcs7_sig_note_authenticated_attr(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; struct pkcs7_signed_info *sinfo = ctx->sinfo; enum OID content_type; pr_devel("AuthAttr: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value); switch (ctx->last_oid) { case OID_contentType: if (__test_and_set_bit(sinfo_has_content_type, &sinfo->aa_set)) goto repeated; content_type = look_up_OID(value, vlen); if (content_type != ctx->msg->data_type) { pr_warn("Mismatch between global data type (%d) and sinfo %u (%d)\n", ctx->msg->data_type, sinfo->index, content_type); return -EBADMSG; } return 0; case OID_signingTime: if (__test_and_set_bit(sinfo_has_signing_time, &sinfo->aa_set)) goto repeated; /* Should we check that the signing time is consistent * with the signer's X.509 cert? */ return x509_decode_time(&sinfo->signing_time, hdrlen, tag, value, vlen); case OID_messageDigest: if (__test_and_set_bit(sinfo_has_message_digest, &sinfo->aa_set)) goto repeated; if (tag != ASN1_OTS) return -EBADMSG; sinfo->msgdigest = value; sinfo->msgdigest_len = vlen; return 0; case OID_smimeCapabilites: if (__test_and_set_bit(sinfo_has_smime_caps, &sinfo->aa_set)) goto repeated; if (ctx->msg->data_type != OID_msIndirectData) { pr_warn("S/MIME Caps only allowed with Authenticode\n"); return -EKEYREJECTED; } return 0; /* Microsoft SpOpusInfo seems to be contain cont[0] 16-bit BE * char URLs and cont[1] 8-bit char URLs. * * Microsoft StatementType seems to contain a list of OIDs that * are also used as extendedKeyUsage types in X.509 certs. 
*/ case OID_msSpOpusInfo: if (__test_and_set_bit(sinfo_has_ms_opus_info, &sinfo->aa_set)) goto repeated; goto authenticode_check; case OID_msStatementType: if (__test_and_set_bit(sinfo_has_ms_statement_type, &sinfo->aa_set)) goto repeated; authenticode_check: if (ctx->msg->data_type != OID_msIndirectData) { pr_warn("Authenticode AuthAttrs only allowed with Authenticode\n"); return -EKEYREJECTED; } /* I'm not sure how to validate these */ return 0; default: return 0; } repeated: /* We permit max one item per AuthenticatedAttribute and no repeats */ pr_warn("Repeated/multivalue AuthAttrs not permitted\n"); return -EKEYREJECTED; } /* * Note the set of auth attributes for digestion purposes [RFC2315 sec 9.3] */ int pkcs7_sig_note_set_of_authattrs(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; struct pkcs7_signed_info *sinfo = ctx->sinfo; if (!test_bit(sinfo_has_content_type, &sinfo->aa_set) || !test_bit(sinfo_has_message_digest, &sinfo->aa_set)) { pr_warn("Missing required AuthAttr\n"); return -EBADMSG; } if (ctx->msg->data_type != OID_msIndirectData && test_bit(sinfo_has_ms_opus_info, &sinfo->aa_set)) { pr_warn("Unexpected Authenticode AuthAttr\n"); return -EBADMSG; } /* We need to switch the 'CONT 0' to a 'SET OF' when we digest */ sinfo->authattrs = value - (hdrlen - 1); sinfo->authattrs_len = vlen + (hdrlen - 1); return 0; } /* * Note the issuing certificate serial number */ int pkcs7_sig_note_serial(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->raw_serial = value; ctx->raw_serial_size = vlen; return 0; } /* * Note the issuer's name */ int pkcs7_sig_note_issuer(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->raw_issuer = value; ctx->raw_issuer_size = vlen; return 0; } /* * Note the issuing cert's subjectKeyIdentifier */ int pkcs7_sig_note_skid(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; pr_devel("SKID: %02x %zu [%*ph]\n", tag, vlen, (unsigned)vlen, value); ctx->raw_skid = value; ctx->raw_skid_size = vlen; return 0; } /* * Note the signature data */ int pkcs7_sig_note_signature(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; ctx->sinfo->sig->s = kmemdup(value, vlen, GFP_KERNEL); if (!ctx->sinfo->sig->s) return -ENOMEM; ctx->sinfo->sig->s_size = vlen; return 0; } /* * Note a signature information block */ int pkcs7_note_signed_info(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { struct pkcs7_parse_context *ctx = context; struct pkcs7_signed_info *sinfo = ctx->sinfo; struct asymmetric_key_id *kid; if (ctx->msg->data_type == OID_msIndirectData && !sinfo->authattrs) { pr_warn("Authenticode requires AuthAttrs\n"); return -EBADMSG; } /* Generate cert issuer + serial number key ID */ if (!ctx->expect_skid) { kid = asymmetric_key_generate_id(ctx->raw_serial, ctx->raw_serial_size, ctx->raw_issuer, ctx->raw_issuer_size); } else { kid = asymmetric_key_generate_id(ctx->raw_skid, ctx->raw_skid_size, "", 0); } if (IS_ERR(kid)) return PTR_ERR(kid); pr_devel("SINFO KID: %u [%*phN]\n", kid->len, kid->len, kid->data); sinfo->sig->auth_ids[0] = kid; sinfo->index = ++ctx->sinfo_index; *ctx->ppsinfo = sinfo; ctx->ppsinfo = &sinfo->next; ctx->sinfo = kzalloc(sizeof(struct 
pkcs7_signed_info), GFP_KERNEL); if (!ctx->sinfo) return -ENOMEM; ctx->sinfo->sig = kzalloc(sizeof(struct public_key_signature), GFP_KERNEL); if (!ctx->sinfo->sig) return -ENOMEM; return 0; }
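/*
 * Illustrative usage sketch, not part of pkcs7_parser.c above: how a caller
 * might drive the exported helpers documented above, i.e. parse a raw blob,
 * peek at the embedded content, then free the message. The function and
 * buffer names here are hypothetical.
 */
static int example_inspect_pkcs7(const void *blob, size_t blob_len)
{
	struct pkcs7_message *pkcs7;
	const void *data;
	size_t data_len;
	int ret;

	pkcs7 = pkcs7_parse_message(blob, blob_len);
	if (IS_ERR(pkcs7))
		return PTR_ERR(pkcs7);	/* e.g. -EBADMSG on malformed ASN.1 */

	/* Returns -ENODATA if the message carried no data object. */
	ret = pkcs7_get_content_data(pkcs7, &data, &data_len, NULL);

	pkcs7_free_message(pkcs7);
	return ret;
}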
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Linux VM pressure
 *
 * Copyright 2012 Linaro Ltd.
 * Anton Vorontsov <anton.vorontsov@linaro.org>
 *
 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
 */

#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/eventfd.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
#include <linux/vmpressure.h>

/*
 * The window size (vmpressure_win) is the number of scanned pages before
 * we try to analyze the scanned/reclaimed ratio. So the window is used as a
 * rate-limit tunable for the "low" level notification, and also for
 * averaging the ratio for medium/critical levels. Using small window
 * sizes can cause a lot of false positives, but too big a window size will
 * delay the notifications.
 *
 * As the vmscan reclaimer logic works with chunks which are multiples of
 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
 *
 * TODO: Make the window size depend on machine size, as we do for vmstat
 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
 */
static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;

/*
 * These thresholds are used when we account memory pressure through
 * scanned/reclaimed ratio. The current values were chosen empirically. In
 * essence, they are percents: the higher the value, the more
 * unsuccessful reclaims there were.
*/ static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95; /* * When there are too little pages left to scan, vmpressure() may miss the * critical pressure as number of pages will be less than "window size". * However, in that case the vmscan priority will raise fast as the * reclaimer will try to scan LRUs more deeply. * * The vmscan logic considers these special priorities: * * prio == DEF_PRIORITY (12): reclaimer starts with that value * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed * prio == 0 : close to OOM, kernel scans every page in an lru * * Any value in this range is acceptable for this tunable (i.e. from 12 to * 0). Current value for the vmpressure_level_critical_prio is chosen * empirically, but the number, in essence, means that we consider * critical level when scanning depth is ~10% of the lru size (vmscan * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one * eights). */ static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); static struct vmpressure *work_to_vmpressure(struct work_struct *work) { return container_of(work, struct vmpressure, work); } static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr); memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return memcg_to_vmpressure(memcg); } enum vmpressure_levels { VMPRESSURE_LOW = 0, VMPRESSURE_MEDIUM, VMPRESSURE_CRITICAL, VMPRESSURE_NUM_LEVELS, }; enum vmpressure_modes { VMPRESSURE_NO_PASSTHROUGH = 0, VMPRESSURE_HIERARCHY, VMPRESSURE_LOCAL, VMPRESSURE_NUM_MODES, }; static const char * const vmpressure_str_levels[] = { [VMPRESSURE_LOW] = "low", [VMPRESSURE_MEDIUM] = "medium", [VMPRESSURE_CRITICAL] = "critical", }; static const char * const vmpressure_str_modes[] = { [VMPRESSURE_NO_PASSTHROUGH] = "default", [VMPRESSURE_HIERARCHY] = "hierarchy", [VMPRESSURE_LOCAL] = "local", }; static enum vmpressure_levels vmpressure_level(unsigned long pressure) { if (pressure >= vmpressure_level_critical) return VMPRESSURE_CRITICAL; else if (pressure >= vmpressure_level_med) return VMPRESSURE_MEDIUM; return VMPRESSURE_LOW; } static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed; unsigned long pressure = 0; /* * reclaimed can be greater than scanned for things such as reclaimed * slab pages. shrink_node() just adds reclaimed pages without a * related increment to scanned pages. */ if (reclaimed >= scanned) goto out; /* * We calculate the ratio (in percents) of how many pages were * scanned vs. reclaimed in a given time frame (window). Note that * time is in VM reclaimer's "ticks", i.e. number of pages * scanned. This makes it possible to set desired reaction time * and serves as a ratelimit. 
*/ pressure = scale - (reclaimed * scale / scanned); pressure = pressure * 100 / scale; out: pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, scanned, reclaimed); return vmpressure_level(pressure); } struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level; enum vmpressure_modes mode; struct list_head node; }; static bool vmpressure_event(struct vmpressure *vmpr, const enum vmpressure_levels level, bool ancestor, bool signalled) { struct vmpressure_event *ev; bool ret = false; mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ancestor && ev->mode == VMPRESSURE_LOCAL) continue; if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) continue; if (level < ev->level) continue; eventfd_signal(ev->efd); ret = true; } mutex_unlock(&vmpr->events_lock); return ret; } static void vmpressure_work_fn(struct work_struct *work) { struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; enum vmpressure_levels level; bool ancestor = false; bool signalled = false; spin_lock(&vmpr->sr_lock); /* * Several contexts might be calling vmpressure(), so it is * possible that the work was rescheduled again before the old * work context cleared the counters. In that case we will run * just after the old work returns, but then scanned might be zero * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } reclaimed = vmpr->tree_reclaimed; vmpr->tree_scanned = 0; vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); level = vmpressure_calc_level(scanned, reclaimed); do { if (vmpressure_event(vmpr, level, ancestor, signalled)) signalled = true; ancestor = true; } while ((vmpr = vmpressure_parent(vmpr))); } /** * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * * This function should be called from the vmscan reclaim path to account * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * * If @tree is set, vmpressure is in traditional userspace reporting * mode: @memcg is considered the pressure root and userspace is * notified of the entire subtree's reclaim efficiency. * * If @tree is not set, reclaim efficiency is recorded for @memcg, and * only in-kernel users are notified. * * This function does not return any value. */ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr; if (mem_cgroup_disabled()) return; /* * The in-kernel users only care about the reclaim efficiency * for this @memcg rather than the whole subtree, and there * isn't and won't be any in-kernel user in a legacy cgroup. */ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree) return; vmpr = memcg_to_vmpressure(memcg); /* * Here we only want to account pressure that userland is able to * help us with. For example, suppose that DMA zone is under * pressure; if we notify userland about that kind of pressure, * then it will be mostly a waste as it will trigger unnecessary * freeing of memory by userland (since userland is more likely to * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That * is why we include only movable, highmem and FS/IO pages. 
* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so * we account it too. */ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) return; /* * If we got here with no pages scanned, then that is an indicator * that reclaimer was unable to find any shrinkable LRUs at the * current scanning depth. But it does not mean that we should * report the critical pressure, yet. If the scanning priority * (scanning depth) goes too high (deep), we will be notified * through vmpressure_prio(). But so far, keep calm. */ if (!scanned) return; if (tree) { spin_lock(&vmpr->sr_lock); scanned = vmpr->tree_scanned += scanned; vmpr->tree_reclaimed += reclaimed; spin_unlock(&vmpr->sr_lock); if (scanned < vmpressure_win) return; schedule_work(&vmpr->work); } else { enum vmpressure_levels level; /* For now, no users for root-level efficiency */ if (!memcg || mem_cgroup_is_root(memcg)) return; spin_lock(&vmpr->sr_lock); scanned = vmpr->scanned += scanned; reclaimed = vmpr->reclaimed += reclaimed; if (scanned < vmpressure_win) { spin_unlock(&vmpr->sr_lock); return; } vmpr->scanned = vmpr->reclaimed = 0; spin_unlock(&vmpr->sr_lock); level = vmpressure_calc_level(scanned, reclaimed); if (level > VMPRESSURE_LOW) { /* * Let the socket buffer allocator know that * we are having trouble reclaiming LRU pages. * * For hysteresis keep the pressure state * asserted for a second in which subsequent * pressure events can occur. */ WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); } } } /** * vmpressure_prio() - Account memory pressure through reclaimer priority level * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle * @prio: reclaimer's priority * * This function should be called from the reclaim path every time when * the vmscan's reclaiming priority (scanning depth) changes. * * This function does not return any value. */ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) { /* * We only use prio for accounting critical level. For more info * see comment for vmpressure_level_critical_prio variable above. */ if (prio > vmpressure_level_critical_prio) return; /* * OK, the prio is below the threshold, updating vmpressure * information before shrinker dives into long shrinking of long * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 * to the vmpressure() basically means that we signal 'critical' * level. */ vmpressure(gfp, memcg, true, vmpressure_win, 0); } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with * @args: event arguments (pressure level threshold, optional mode) * * This function associates eventfd context with the vmpressure * infrastructure, so that the notifications will be delivered to the * @eventfd. The @args parameter is a comma-delimited string that denotes a * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. * "hierarchy" or "local"). * * To be used as memcg event method. * * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could * not be parsed. 
*/ int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; enum vmpressure_levels level; char *spec, *spec_orig; char *token; int ret = 0; spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) return -ENOMEM; /* Find required level */ token = strsep(&spec, ","); ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); if (ret < 0) goto out; level = ret; /* Find optional mode */ token = strsep(&spec, ","); if (token) { ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); if (ret < 0) goto out; mode = ret; } ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) { ret = -ENOMEM; goto out; } ev->efd = eventfd; ev->level = level; ev->mode = mode; mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); ret = 0; out: kfree(spec_orig); return ret; } /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure * @memcg: memcg handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). * * To be used as memcg event method. */ void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ev->efd != eventfd) continue; list_del(&ev->node); kfree(ev); break; } mutex_unlock(&vmpr->events_lock); } /** * vmpressure_init() - Initialize vmpressure control structure * @vmpr: Structure to be initialized * * This function should be called on every allocated vmpressure structure * before any usage. */ void vmpressure_init(struct vmpressure *vmpr) { spin_lock_init(&vmpr->sr_lock); mutex_init(&vmpr->events_lock); INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); } /** * vmpressure_cleanup() - shuts down vmpressure control structure * @vmpr: Structure to be cleaned up * * This function should be called before the structure in which it is * embedded is cleaned up. */ void vmpressure_cleanup(struct vmpressure *vmpr) { /* * Make sure there is no pending work before eventfd infrastructure * goes away. */ flush_work(&vmpr->work); }
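/*
 * Worked example, not part of vmpressure.c above: with scale = scanned +
 * reclaimed, the two steps in vmpressure_calc_level() reduce, up to integer
 * rounding, to pressure = 100 * (scanned - reclaimed) / scanned. The numbers
 * below are hypothetical.
 */
static void example_pressure_math(void)
{
	unsigned long scanned = 512, reclaimed = 64;
	unsigned long scale = scanned + reclaimed;		/* 576 */
	unsigned long pressure;

	pressure = scale - (reclaimed * scale / scanned);	/* 576 - 72 = 504 */
	pressure = pressure * 100 / scale;			/* 50400 / 576 = 87 */

	/*
	 * 87 >= vmpressure_level_med (60) but < vmpressure_level_critical
	 * (95), so this window would be reported as "medium".
	 */
	(void)pressure;
}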
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 *
 * Authors:
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: integrity_iint.c
 *	- implements the integrity hooks: integrity_inode_alloc,
 *	  integrity_inode_free
 *	- cache integrity information associated with an inode
 *	  using an rbtree.
 */
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/security.h>
#include <linux/lsm_hooks.h>
#include "integrity.h"

static struct rb_root integrity_iint_tree = RB_ROOT;
static DEFINE_RWLOCK(integrity_iint_lock);
static struct kmem_cache *iint_cache __ro_after_init;

struct dentry *integrity_dir;

/*
 * __integrity_iint_find - return the iint associated with an inode
 */
static struct integrity_iint_cache *__integrity_iint_find(struct inode *inode)
{
	struct integrity_iint_cache *iint;
	struct rb_node *n = integrity_iint_tree.rb_node;

	while (n) {
		iint = rb_entry(n, struct integrity_iint_cache, rb_node);

		if (inode < iint->inode)
			n = n->rb_left;
		else if (inode > iint->inode)
			n = n->rb_right;
		else
			return iint;
	}

	return NULL;
}

/*
 * integrity_iint_find - return the iint associated with an inode
 */
struct integrity_iint_cache *integrity_iint_find(struct inode *inode)
{
	struct integrity_iint_cache *iint;

	if (!IS_IMA(inode))
		return NULL;

	read_lock(&integrity_iint_lock);
	iint = __integrity_iint_find(inode);
	read_unlock(&integrity_iint_lock);

	return iint;
}

#define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH + 1)

/*
 * It is not clear that IMA should be nested at all, but as long as it measures
 * files both on overlayfs and on the underlying fs, we need to annotate the
 * iint mutex to avoid lockdep false positives related to IMA + overlayfs.
 * See ovl_lockdep_annotate_inode_mutex_key() for more details.
*/ static inline void iint_lockdep_annotate(struct integrity_iint_cache *iint, struct inode *inode) { #ifdef CONFIG_LOCKDEP static struct lock_class_key iint_mutex_key[IMA_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth; if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING)) depth = 0; lockdep_set_class(&iint->mutex, &iint_mutex_key[depth]); #endif } static void iint_init_always(struct integrity_iint_cache *iint, struct inode *inode) { iint->ima_hash = NULL; iint->version = 0; iint->flags = 0UL; iint->atomic_flags = 0UL; iint->ima_file_status = INTEGRITY_UNKNOWN; iint->ima_mmap_status = INTEGRITY_UNKNOWN; iint->ima_bprm_status = INTEGRITY_UNKNOWN; iint->ima_read_status = INTEGRITY_UNKNOWN; iint->ima_creds_status = INTEGRITY_UNKNOWN; iint->evm_status = INTEGRITY_UNKNOWN; iint->measured_pcrs = 0; mutex_init(&iint->mutex); iint_lockdep_annotate(iint, inode); } static void iint_free(struct integrity_iint_cache *iint) { kfree(iint->ima_hash); mutex_destroy(&iint->mutex); kmem_cache_free(iint_cache, iint); } /** * integrity_inode_get - find or allocate an iint associated with an inode * @inode: pointer to the inode * @return: allocated iint * * Caller must lock i_mutex */ struct integrity_iint_cache *integrity_inode_get(struct inode *inode) { struct rb_node **p; struct rb_node *node, *parent = NULL; struct integrity_iint_cache *iint, *test_iint; iint = integrity_iint_find(inode); if (iint) return iint; iint = kmem_cache_alloc(iint_cache, GFP_NOFS); if (!iint) return NULL; iint_init_always(iint, inode); write_lock(&integrity_iint_lock); p = &integrity_iint_tree.rb_node; while (*p) { parent = *p; test_iint = rb_entry(parent, struct integrity_iint_cache, rb_node); if (inode < test_iint->inode) { p = &(*p)->rb_left; } else if (inode > test_iint->inode) { p = &(*p)->rb_right; } else { write_unlock(&integrity_iint_lock); kmem_cache_free(iint_cache, iint); return test_iint; } } iint->inode = inode; node = &iint->rb_node; inode->i_flags |= S_IMA; rb_link_node(node, parent, p); rb_insert_color(node, &integrity_iint_tree); write_unlock(&integrity_iint_lock); return iint; } /** * integrity_inode_free - called on security_inode_free * @inode: pointer to the inode * * Free the integrity information(iint) associated with an inode. */ void integrity_inode_free(struct inode *inode) { struct integrity_iint_cache *iint; if (!IS_IMA(inode)) return; write_lock(&integrity_iint_lock); iint = __integrity_iint_find(inode); rb_erase(&iint->rb_node, &integrity_iint_tree); write_unlock(&integrity_iint_lock); iint_free(iint); } static void iint_init_once(void *foo) { struct integrity_iint_cache *iint = (struct integrity_iint_cache *) foo; memset(iint, 0, sizeof(*iint)); } static int __init integrity_iintcache_init(void) { iint_cache = kmem_cache_create("iint_cache", sizeof(struct integrity_iint_cache), 0, SLAB_PANIC, iint_init_once); return 0; } DEFINE_LSM(integrity) = { .name = "integrity", .init = integrity_iintcache_init, .order = LSM_ORDER_LAST, }; /* * integrity_kernel_read - read data from the file * * This is a function for reading file content instead of kernel_read(). * It does not perform locking checks to ensure it cannot be blocked. * It does not perform security checks because it is irrelevant for IMA. 
* */ int integrity_kernel_read(struct file *file, loff_t offset, void *addr, unsigned long count) { return __kernel_read(file, addr, count, &offset); } /* * integrity_load_keys - load integrity keys hook * * Hooks is called from init/main.c:kernel_init_freeable() * when rootfs is ready */ void __init integrity_load_keys(void) { ima_load_x509(); if (!IS_ENABLED(CONFIG_IMA_LOAD_X509)) evm_load_x509(); } static int __init integrity_fs_init(void) { integrity_dir = securityfs_create_dir("integrity", NULL); if (IS_ERR(integrity_dir)) { int ret = PTR_ERR(integrity_dir); if (ret != -ENODEV) pr_err("Unable to create integrity sysfs dir: %d\n", ret); integrity_dir = NULL; return ret; } return 0; } late_initcall(integrity_fs_init)
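/*
 * Illustrative sketch, not part of integrity_iint.c above: how a caller such
 * as an IMA file-hashing routine might walk a file with
 * integrity_kernel_read(), which wraps __kernel_read() as documented above.
 * The buffer handling and function name are hypothetical.
 */
static int example_read_whole_file(struct file *file, loff_t size,
				   void *buf, int buf_len)
{
	loff_t offset = 0;

	while (offset < size) {
		int rc = integrity_kernel_read(file, offset, buf,
					       min_t(loff_t, size - offset,
						     buf_len));
		if (rc < 0)
			return rc;	/* propagate the read error */
		if (rc == 0)
			break;		/* unexpected EOF, stop here */
		/* ... feed buf[0..rc) to a hash or policy check ... */
		offset += rc;
	}
	return 0;
}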
// SPDX-License-Identifier: GPL-2.0-only
/*
 * ebt_nflog
 *
 * Author:
 * Peter Warasin <peter@endian.com>
 *
 * February, 2008
 *
 * Based on:
 * xt_NFLOG.c, (C) 2006 by Patrick McHardy <kaber@trash.net>
 * ebt_ulog.c, (C) 2004 by Bart De Schuymer <bdschuym@pandora.be>
 *
 */

#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
#include <linux/netfilter_bridge/ebt_nflog.h>
#include <net/netfilter/nf_log.h>

static unsigned int
ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct ebt_nflog_info *info = par->targinfo;
	struct net *net = xt_net(par);
	struct nf_loginfo li;

	li.type = NF_LOG_TYPE_ULOG;
	li.u.ulog.copy_len = info->len;
	li.u.ulog.group = info->group;
	li.u.ulog.qthreshold = info->threshold;
	li.u.ulog.flags = 0;

	nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par),
		      xt_out(par), &li, "%s", info->prefix);
	return EBT_CONTINUE;
}

static int ebt_nflog_tg_check(const struct xt_tgchk_param *par)
{
	struct ebt_nflog_info *info = par->targinfo;

	if (info->flags & ~EBT_NFLOG_MASK)
		return -EINVAL;
	info->prefix[EBT_NFLOG_PREFIX_SIZE - 1] = '\0';
	return 0;
}

static struct xt_target ebt_nflog_tg_reg __read_mostly = {
	.name		= "nflog",
	.revision	= 0,
	.family		= NFPROTO_BRIDGE,
	.target		= ebt_nflog_tg,
	.checkentry	= ebt_nflog_tg_check,
	.targetsize	= sizeof(struct ebt_nflog_info),
	.me		= THIS_MODULE,
};

static int __init ebt_nflog_init(void)
{
	return xt_register_target(&ebt_nflog_tg_reg);
}

static void __exit ebt_nflog_fini(void)
{
	xt_unregister_target(&ebt_nflog_tg_reg);
}

module_init(ebt_nflog_init);
module_exit(ebt_nflog_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Peter Warasin <peter@endian.com>");
MODULE_DESCRIPTION("ebtables NFLOG netfilter logging module");
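/*
 * Usage note, not part of ebt_nflog.c above: from userspace this module is
 * driven through the ebtables nflog watcher, which fills struct
 * ebt_nflog_info with the group, prefix and threshold used above. Assuming
 * the standard watcher options, a rule would look roughly like:
 *
 *	ebtables -A FORWARD --nflog-group 1 --nflog-prefix "bridged" -j ACCEPT
 *
 * Packets matching the rule are then handed to the configured logging
 * backend for the bridge family (typically nfnetlink_log) via
 * nf_log_packet(), as shown in ebt_nflog_tg().
 */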
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * i2c.h - definitions for the Linux i2c bus interface
 * Copyright (C) 1995-2000 Simon G. Vogl
 * Copyright (C) 2013-2019 Wolfram Sang <wsa@kernel.org>
 *
 * With some changes from Kyösti Mälkki <kmalkki@cc.hut.fi> and
 * Frodo Looijaard <frodol@dds.nl>
 */
#ifndef _LINUX_I2C_H
#define _LINUX_I2C_H

#include <linux/acpi.h>		/* for acpi_handle */
#include <linux/bits.h>
#include <linux/mod_devicetable.h>
#include <linux/device.h>	/* for struct device */
#include <linux/sched.h>	/* for completion */
#include <linux/mutex.h>
#include <linux/regulator/consumer.h>
#include <linux/rtmutex.h>
#include <linux/irqdomain.h>	/* for Host Notify IRQ */
#include <linux/of.h>		/* for struct device_node */
#include <linux/swab.h>		/* for swab16 */
#include <uapi/linux/i2c.h>

extern const struct bus_type i2c_bus_type;
extern struct device_type i2c_adapter_type;
extern struct device_type i2c_client_type;

/* --- General options ------------------------------------------------ */

struct i2c_msg;
struct i2c_algorithm;
struct i2c_adapter;
struct i2c_client;
struct i2c_driver;
struct i2c_device_identity;
union i2c_smbus_data;
struct i2c_board_info;
enum i2c_slave_event;
typedef int (*i2c_slave_cb_t)(struct i2c_client *client,
			      enum i2c_slave_event event, u8 *val);

/* I2C Frequency Modes */
#define I2C_MAX_STANDARD_MODE_FREQ	100000
#define I2C_MAX_FAST_MODE_FREQ		400000
#define I2C_MAX_FAST_MODE_PLUS_FREQ	1000000
#define I2C_MAX_TURBO_MODE_FREQ		1400000
#define I2C_MAX_HIGH_SPEED_MODE_FREQ	3400000
#define I2C_MAX_ULTRA_FAST_MODE_FREQ	5000000

struct module;
struct property_entry;

#if IS_ENABLED(CONFIG_I2C)
/* Return the Frequency mode string based on the bus frequency */
const char *i2c_freq_mode_string(u32 bus_freq_hz);

/*
 * The master routines are the ones normally used to transmit data to devices
 * on a bus (or read from them). Apart from two basic transfer functions to
 * transmit one message at a time, a more complex version can be used to
 * transmit an arbitrary number of messages without interruption.
 * @count must be less than 64k since msg.len is u16.
 */
int i2c_transfer_buffer_flags(const struct i2c_client *client,
			      char *buf, int count, u16 flags);

/**
 * i2c_master_recv - issue a single I2C message in master receive mode
 * @client: Handle to slave device
 * @buf: Where to store data read from slave
 * @count: How many bytes to read, must be less than 64k since msg.len is u16
 *
 * Returns negative errno, or else the number of bytes read.
*/ static inline int i2c_master_recv(const struct i2c_client *client, char *buf, int count) { return i2c_transfer_buffer_flags(client, buf, count, I2C_M_RD); }; /** * i2c_master_recv_dmasafe - issue a single I2C message in master receive mode * using a DMA safe buffer * @client: Handle to slave device * @buf: Where to store data read from slave, must be safe to use with DMA * @count: How many bytes to read, must be less than 64k since msg.len is u16 * * Returns negative errno, or else the number of bytes read. */ static inline int i2c_master_recv_dmasafe(const struct i2c_client *client, char *buf, int count) { return i2c_transfer_buffer_flags(client, buf, count, I2C_M_RD | I2C_M_DMA_SAFE); }; /** * i2c_master_send - issue a single I2C message in master transmit mode * @client: Handle to slave device * @buf: Data that will be written to the slave * @count: How many bytes to write, must be less than 64k since msg.len is u16 * * Returns negative errno, or else the number of bytes written. */ static inline int i2c_master_send(const struct i2c_client *client, const char *buf, int count) { return i2c_transfer_buffer_flags(client, (char *)buf, count, 0); }; /** * i2c_master_send_dmasafe - issue a single I2C message in master transmit mode * using a DMA safe buffer * @client: Handle to slave device * @buf: Data that will be written to the slave, must be safe to use with DMA * @count: How many bytes to write, must be less than 64k since msg.len is u16 * * Returns negative errno, or else the number of bytes written. */ static inline int i2c_master_send_dmasafe(const struct i2c_client *client, const char *buf, int count) { return i2c_transfer_buffer_flags(client, (char *)buf, count, I2C_M_DMA_SAFE); }; /* Transfer num messages. */ int i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); /* Unlocked flavor */ int __i2c_transfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); /* This is the very generalized SMBus access routine. You probably do not want to use this, though; one of the functions below may be much easier, and probably just as fast. Note that we use i2c_adapter here, because you do not need a specific smbus adapter to call this function. */ s32 i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, char read_write, u8 command, int protocol, union i2c_smbus_data *data); /* Unlocked flavor */ s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, unsigned short flags, char read_write, u8 command, int protocol, union i2c_smbus_data *data); /* Now follow the 'nice' access routines. These also document the calling conventions of i2c_smbus_xfer. */ u8 i2c_smbus_pec(u8 crc, u8 *p, size_t count); s32 i2c_smbus_read_byte(const struct i2c_client *client); s32 i2c_smbus_write_byte(const struct i2c_client *client, u8 value); s32 i2c_smbus_read_byte_data(const struct i2c_client *client, u8 command); s32 i2c_smbus_write_byte_data(const struct i2c_client *client, u8 command, u8 value); s32 i2c_smbus_read_word_data(const struct i2c_client *client, u8 command); s32 i2c_smbus_write_word_data(const struct i2c_client *client, u8 command, u16 value); static inline s32 i2c_smbus_read_word_swapped(const struct i2c_client *client, u8 command) { s32 value = i2c_smbus_read_word_data(client, command); return (value < 0) ? 
value : swab16(value); } static inline s32 i2c_smbus_write_word_swapped(const struct i2c_client *client, u8 command, u16 value) { return i2c_smbus_write_word_data(client, command, swab16(value)); } /* Returns the number of read bytes */ s32 i2c_smbus_read_block_data(const struct i2c_client *client, u8 command, u8 *values); s32 i2c_smbus_write_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values); /* Returns the number of read bytes */ s32 i2c_smbus_read_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, u8 *values); s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, u8 command, u8 length, const u8 *values); s32 i2c_smbus_read_i2c_block_data_or_emulated(const struct i2c_client *client, u8 command, u8 length, u8 *values); int i2c_get_device_id(const struct i2c_client *client, struct i2c_device_identity *id); const struct i2c_device_id *i2c_client_get_device_id(const struct i2c_client *client); #endif /* I2C */ /** * struct i2c_device_identity - i2c client device identification * @manufacturer_id: 0 - 4095, database maintained by NXP * @part_id: 0 - 511, according to manufacturer * @die_revision: 0 - 7, according to manufacturer */ struct i2c_device_identity { u16 manufacturer_id; #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS 0 #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS_1 1 #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS_2 2 #define I2C_DEVICE_ID_NXP_SEMICONDUCTORS_3 3 #define I2C_DEVICE_ID_RAMTRON_INTERNATIONAL 4 #define I2C_DEVICE_ID_ANALOG_DEVICES 5 #define I2C_DEVICE_ID_STMICROELECTRONICS 6 #define I2C_DEVICE_ID_ON_SEMICONDUCTOR 7 #define I2C_DEVICE_ID_SPRINTEK_CORPORATION 8 #define I2C_DEVICE_ID_ESPROS_PHOTONICS_AG 9 #define I2C_DEVICE_ID_FUJITSU_SEMICONDUCTOR 10 #define I2C_DEVICE_ID_FLIR 11 #define I2C_DEVICE_ID_O2MICRO 12 #define I2C_DEVICE_ID_ATMEL 13 #define I2C_DEVICE_ID_NONE 0xffff u16 part_id; u8 die_revision; }; enum i2c_alert_protocol { I2C_PROTOCOL_SMBUS_ALERT, I2C_PROTOCOL_SMBUS_HOST_NOTIFY, }; /** * enum i2c_driver_flags - Flags for an I2C device driver * * @I2C_DRV_ACPI_WAIVE_D0_PROBE: Don't put the device in D0 state for probe */ enum i2c_driver_flags { I2C_DRV_ACPI_WAIVE_D0_PROBE = BIT(0), }; /** * struct i2c_driver - represent an I2C device driver * @class: What kind of i2c device we instantiate (for detect) * @probe: Callback for device binding * @remove: Callback for device unbinding * @shutdown: Callback for device shutdown * @alert: Alert callback, for example for the SMBus alert protocol * @command: Callback for bus-wide signaling (optional) * @driver: Device driver model driver * @id_table: List of I2C devices supported by this driver * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) * @clients: List of detected clients we created (for i2c-core use only) * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver. * * For automatic device detection, both @detect and @address_list must * be defined. @class should also be set, otherwise only devices forced * with module parameters will be created. The detect function must * fill at least the name field of the i2c_board_info structure it is * handed upon successful detection, and possibly also the flags field. * * If @detect is missing, the driver will still work fine for enumerated * devices. Detected devices simply won't be supported. 
This is expected * for the many I2C/SMBus devices which can't be detected reliably, and * the ones which can always be enumerated in practice. * * The i2c_client structure which is handed to the @detect callback is * not a real i2c_client. It is initialized just enough so that you can * call i2c_smbus_read_byte_data and friends on it. Don't do anything * else with it. In particular, calling dev_dbg and friends on it is * not allowed. */ struct i2c_driver { unsigned int class; /* Standard driver model interfaces */ int (*probe)(struct i2c_client *client); void (*remove)(struct i2c_client *client); /* driver model interfaces that don't relate to enumeration */ void (*shutdown)(struct i2c_client *client); /* Alert callback, for example for the SMBus alert protocol. * The format and meaning of the data value depends on the protocol. * For the SMBus alert protocol, there is a single bit of data passed * as the alert response's low bit ("event flag"). * For the SMBus Host Notify protocol, the data corresponds to the * 16-bit payload data reported by the slave device acting as master. */ void (*alert)(struct i2c_client *client, enum i2c_alert_protocol protocol, unsigned int data); /* a ioctl like command that can be used to perform specific functions * with the device. */ int (*command)(struct i2c_client *client, unsigned int cmd, void *arg); struct device_driver driver; const struct i2c_device_id *id_table; /* Device detection callback for automatic device creation */ int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; struct list_head clients; u32 flags; }; #define to_i2c_driver(d) container_of(d, struct i2c_driver, driver) /** * struct i2c_client - represent an I2C slave device * @flags: see I2C_CLIENT_* for possible flags * @addr: Address used on the I2C bus connected to the parent adapter. * @name: Indicates the type of the device, usually a chip name that's * generic enough to hide second-sourcing and compatible revisions. * @adapter: manages the bus segment hosting this I2C device * @dev: Driver model device node for the slave. * @init_irq: IRQ that was set at initialization * @irq: indicates the IRQ generated by this device (if any) * @detected: member of an i2c_driver.clients list or i2c-core's * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources * acquired when probing this device. * * An i2c_client identifies a single device (i.e. chip) connected to an * i2c bus. The behaviour exposed to Linux is defined by the driver * managing the device. 
*/ struct i2c_client { unsigned short flags; /* div., see below */ #define I2C_CLIENT_PEC 0x04 /* Use Packet Error Checking */ #define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ /* Must equal I2C_M_TEN below */ #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ unsigned short addr; /* chip address - NOTE: 7bit */ /* addresses are stored in the */ /* _LOWER_ 7 bits */ char name[I2C_NAME_SIZE]; struct i2c_adapter *adapter; /* the adapter we sit on */ struct device dev; /* the device structure */ int init_irq; /* irq set at initialization */ int irq; /* irq issued by device */ struct list_head detected; #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif void *devres_group_id; /* ID of probe devres group */ }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) struct i2c_adapter *i2c_verify_adapter(struct device *dev); const struct i2c_device_id *i2c_match_id(const struct i2c_device_id *id, const struct i2c_client *client); const void *i2c_get_match_data(const struct i2c_client *client); static inline struct i2c_client *kobj_to_i2c_client(struct kobject *kobj) { struct device * const dev = kobj_to_dev(kobj); return to_i2c_client(dev); } static inline void *i2c_get_clientdata(const struct i2c_client *client) { return dev_get_drvdata(&client->dev); } static inline void i2c_set_clientdata(struct i2c_client *client, void *data) { dev_set_drvdata(&client->dev, data); } /* I2C slave support */ enum i2c_slave_event { I2C_SLAVE_READ_REQUESTED, I2C_SLAVE_WRITE_REQUESTED, I2C_SLAVE_READ_PROCESSED, I2C_SLAVE_WRITE_RECEIVED, I2C_SLAVE_STOP, }; int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb); int i2c_slave_unregister(struct i2c_client *client); int i2c_slave_event(struct i2c_client *client, enum i2c_slave_event event, u8 *val); #if IS_ENABLED(CONFIG_I2C_SLAVE) bool i2c_detect_slave_mode(struct device *dev); #else static inline bool i2c_detect_slave_mode(struct device *dev) { return false; } #endif /** * struct i2c_board_info - template for device creation * @type: chip type, to initialize i2c_client.name * @flags: to initialize i2c_client.flags * @addr: stored in i2c_client.addr * @dev_name: Overrides the default <busnr>-<addr> dev_name if set * @platform_data: stored in i2c_client.dev.platform_data * @of_node: pointer to OpenFirmware device node * @fwnode: device node supplied by the platform firmware * @swnode: software node for the device * @resources: resources associated with the device * @num_resources: number of resources in the @resources array * @irq: stored in i2c_client.irq * * I2C doesn't actually support hardware probing, although controllers and * devices may be able to use I2C_SMBUS_QUICK to tell whether or not there's * a device at a given address. Drivers commonly need more information than * that, such as chip type, configuration, associated IRQ, and so on. * * i2c_board_info is used to build tables of information listing I2C devices * that are present. This information is used to grow the driver model tree. * For mainboards this is done statically using i2c_register_board_info(); * bus numbers identify adapters that aren't yet available. For add-on boards, * i2c_new_client_device() does this dynamically with the adapter already known. 
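 *
 * As an illustrative sketch only (the chip name, address and IRQ below are
 * hypothetical), a mainboard declaration table could contain an entry like:
 *
 *	static struct i2c_board_info const example_board_info[] = {
 *		{ I2C_BOARD_INFO("hypothetical_rtc", 0x68), .irq = 17 },
 *	};
 *
 * i2c_register_board_info() would register such a table at arch_initcall
 * time, while i2c_new_client_device() takes a single entry once the adapter
 * is already known.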
*/ struct i2c_board_info { char type[I2C_NAME_SIZE]; unsigned short flags; unsigned short addr; const char *dev_name; void *platform_data; struct device_node *of_node; struct fwnode_handle *fwnode; const struct software_node *swnode; const struct resource *resources; unsigned int num_resources; int irq; }; /** * I2C_BOARD_INFO - macro used to list an i2c device and its address * @dev_type: identifies the device type * @dev_addr: the device's address on the bus. * * This macro initializes essential fields of a struct i2c_board_info, * declaring what has been provided on a particular board. Optional * fields (such as associated irq, or device-specific platform_data) * are provided using conventional syntax. */ #define I2C_BOARD_INFO(dev_type, dev_addr) \ .type = dev_type, .addr = (dev_addr) #if IS_ENABLED(CONFIG_I2C) /* * Add-on boards should register/unregister their devices; e.g. a board * with integrated I2C, a config eeprom, sensors, and a codec that's * used in conjunction with the primary hardware. */ struct i2c_client * i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info); /* If you don't know the exact address of an I2C device, use this variant * instead, which can probe for device presence in a list of possible * addresses. The "probe" callback function is optional. If it is provided, * it must return 1 on successful probe, 0 otherwise. If it is not provided, * a default probing method is used. */ struct i2c_client * i2c_new_scanned_device(struct i2c_adapter *adap, struct i2c_board_info *info, unsigned short const *addr_list, int (*probe)(struct i2c_adapter *adap, unsigned short addr)); /* Common custom probe functions */ int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr); struct i2c_client * i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address); struct i2c_client * devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address); struct i2c_client * i2c_new_ancillary_device(struct i2c_client *client, const char *name, u16 default_addr); void i2c_unregister_device(struct i2c_client *client); struct i2c_client *i2c_verify_client(struct device *dev); #else static inline struct i2c_client *i2c_verify_client(struct device *dev) { return NULL; } #endif /* I2C */ /* Mainboard arch_initcall() code should register all its I2C devices. * This is done at arch_initcall time, before declaring any i2c adapters. * Modules for add-on boards must use other calls. */ #ifdef CONFIG_I2C_BOARDINFO int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else static inline int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n) { return 0; } #endif /* I2C_BOARDINFO */ /** * struct i2c_algorithm - represent I2C transfer method * @master_xfer: Issue a set of i2c transactions to the given I2C adapter * defined by the msgs array, with num messages available to transfer via * the adapter specified by adap. * @master_xfer_atomic: same as @master_xfer. Yet, only using atomic context * so e.g. PMICs can be accessed very late before shutdown. Optional. * @smbus_xfer: Issue smbus transactions to the given I2C adapter. If this * is not present, then the bus layer will try and convert the SMBus calls * into I2C transfers instead. * @smbus_xfer_atomic: same as @smbus_xfer. Yet, only using atomic context * so e.g. PMICs can be accessed very late before shutdown. Optional. 
* @functionality: Return the flags that this algorithm/adapter pair supports * from the ``I2C_FUNC_*`` flags. * @reg_slave: Register given client to I2C slave mode of this adapter * @unreg_slave: Unregister given client from I2C slave mode of this adapter * * The following structs are for those who like to implement new bus drivers: * i2c_algorithm is the interface to a class of hardware solutions which can * be addressed using the same bus algorithms - i.e. bit-banging or the PCF8584 * to name two of the most common. * * The return codes from the ``master_xfer{_atomic}`` fields should indicate the * type of error code that occurred during the transfer, as documented in the * Kernel Documentation file Documentation/i2c/fault-codes.rst. Otherwise, the * number of messages executed should be returned. */ struct i2c_algorithm { /* * If an adapter algorithm can't do I2C-level access, set master_xfer * to NULL. If an adapter algorithm can do SMBus access, set * smbus_xfer. If set to NULL, the SMBus protocol is simulated * using common I2C messages. * * master_xfer should return the number of messages successfully * processed, or a negative value on error */ int (*master_xfer)(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); int (*master_xfer_atomic)(struct i2c_adapter *adap, struct i2c_msg *msgs, int num); int (*smbus_xfer)(struct i2c_adapter *adap, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data *data); int (*smbus_xfer_atomic)(struct i2c_adapter *adap, u16 addr, unsigned short flags, char read_write, u8 command, int size, union i2c_smbus_data *data); /* To determine what the adapter supports */ u32 (*functionality)(struct i2c_adapter *adap); #if IS_ENABLED(CONFIG_I2C_SLAVE) int (*reg_slave)(struct i2c_client *client); int (*unreg_slave)(struct i2c_client *client); #endif }; /** * struct i2c_lock_operations - represent I2C locking operations * @lock_bus: Get exclusive access to an I2C bus segment * @trylock_bus: Try to get exclusive access to an I2C bus segment * @unlock_bus: Release exclusive access to an I2C bus segment * * The main operations are wrapped by i2c_lock_bus and i2c_unlock_bus. */ struct i2c_lock_operations { void (*lock_bus)(struct i2c_adapter *adapter, unsigned int flags); int (*trylock_bus)(struct i2c_adapter *adapter, unsigned int flags); void (*unlock_bus)(struct i2c_adapter *adapter, unsigned int flags); }; /** * struct i2c_timings - I2C timing information * @bus_freq_hz: the bus frequency in Hz * @scl_rise_ns: time SCL signal takes to rise in ns; t(r) in the I2C specification * @scl_fall_ns: time SCL signal takes to fall in ns; t(f) in the I2C specification * @scl_int_delay_ns: time IP core additionally needs to setup SCL in ns * @sda_fall_ns: time SDA signal takes to fall in ns; t(f) in the I2C specification * @sda_hold_ns: time IP core additionally needs to hold SDA in ns * @digital_filter_width_ns: width in ns of spikes on i2c lines that the IP core * digital filter can filter out * @analog_filter_cutoff_freq_hz: threshold frequency for the low pass IP core * analog filter */ struct i2c_timings { u32 bus_freq_hz; u32 scl_rise_ns; u32 scl_fall_ns; u32 scl_int_delay_ns; u32 sda_fall_ns; u32 sda_hold_ns; u32 digital_filter_width_ns; u32 analog_filter_cutoff_freq_hz; }; /** * struct i2c_bus_recovery_info - I2C bus recovery information * @recover_bus: Recover routine. Either pass driver's recover_bus() routine, or * i2c_generic_scl_recovery(). * @get_scl: This gets current value of SCL line. 
Mandatory for generic SCL * recovery. Populated internally for generic GPIO recovery. * @set_scl: This sets/clears the SCL line. Mandatory for generic SCL recovery. * Populated internally for generic GPIO recovery. * @get_sda: This gets current value of SDA line. This or set_sda() is mandatory * for generic SCL recovery. Populated internally, if sda_gpio is a valid * GPIO, for generic GPIO recovery. * @set_sda: This sets/clears the SDA line. This or get_sda() is mandatory for * generic SCL recovery. Populated internally, if sda_gpio is a valid GPIO, * for generic GPIO recovery. * @get_bus_free: Returns the bus free state as seen from the IP core in case it * has a more complex internal logic than just reading SDA. Optional. * @prepare_recovery: This will be called before starting recovery. Platform may * configure padmux here for SDA/SCL line or something else they want. * @unprepare_recovery: This will be called after completing recovery. Platform * may configure padmux here for SDA/SCL line or something else they want. * @scl_gpiod: gpiod of the SCL line. Only required for GPIO recovery. * @sda_gpiod: gpiod of the SDA line. Only required for GPIO recovery. * @pinctrl: pinctrl used by GPIO recovery to change the state of the I2C pins. * Optional. * @pins_default: default pinctrl state of SCL/SDA lines, when they are assigned * to the I2C bus. Optional. Populated internally for GPIO recovery, if * state with the name PINCTRL_STATE_DEFAULT is found and pinctrl is valid. * @pins_gpio: recovery pinctrl state of SCL/SDA lines, when they are used as * GPIOs. Optional. Populated internally for GPIO recovery, if this state * is called "gpio" or "recovery" and pinctrl is valid. */ struct i2c_bus_recovery_info { int (*recover_bus)(struct i2c_adapter *adap); int (*get_scl)(struct i2c_adapter *adap); void (*set_scl)(struct i2c_adapter *adap, int val); int (*get_sda)(struct i2c_adapter *adap); void (*set_sda)(struct i2c_adapter *adap, int val); int (*get_bus_free)(struct i2c_adapter *adap); void (*prepare_recovery)(struct i2c_adapter *adap); void (*unprepare_recovery)(struct i2c_adapter *adap); /* gpio recovery */ struct gpio_desc *scl_gpiod; struct gpio_desc *sda_gpiod; struct pinctrl *pinctrl; struct pinctrl_state *pins_default; struct pinctrl_state *pins_gpio; }; int i2c_recover_bus(struct i2c_adapter *adap); /* Generic recovery routines */ int i2c_generic_scl_recovery(struct i2c_adapter *adap); /** * struct i2c_adapter_quirks - describe flaws of an i2c adapter * @flags: see I2C_AQ_* for possible flags and read below * @max_num_msgs: maximum number of messages per transfer * @max_write_len: maximum length of a write message * @max_read_len: maximum length of a read message * @max_comb_1st_msg_len: maximum length of the first msg in a combined message * @max_comb_2nd_msg_len: maximum length of the second msg in a combined message * * Note about combined messages: Some I2C controllers can only send one message * per transfer, plus something called combined message or write-then-read. * This is (usually) a small write message followed by a read message and * barely enough to access register based devices like EEPROMs. There is a flag * to support this mode. It implies max_num_msg = 2 and does the length checks * with max_comb_*_len because combined message mode usually has its own * limitations. Because of HW implementations, some controllers can actually do * write-then-anything or other variants. 
To support that, write-then-read has * been broken out into smaller bits like write-first and read-second which can * be combined as needed. */ struct i2c_adapter_quirks { u64 flags; int max_num_msgs; u16 max_write_len; u16 max_read_len; u16 max_comb_1st_msg_len; u16 max_comb_2nd_msg_len; }; /* enforce max_num_msgs = 2 and use max_comb_*_len for length checks */ #define I2C_AQ_COMB BIT(0) /* first combined message must be write */ #define I2C_AQ_COMB_WRITE_FIRST BIT(1) /* second combined message must be read */ #define I2C_AQ_COMB_READ_SECOND BIT(2) /* both combined messages must have the same target address */ #define I2C_AQ_COMB_SAME_ADDR BIT(3) /* convenience macro for typical write-then read case */ #define I2C_AQ_COMB_WRITE_THEN_READ (I2C_AQ_COMB | I2C_AQ_COMB_WRITE_FIRST | \ I2C_AQ_COMB_READ_SECOND | I2C_AQ_COMB_SAME_ADDR) /* clock stretching is not supported */ #define I2C_AQ_NO_CLK_STRETCH BIT(4) /* message cannot have length of 0 */ #define I2C_AQ_NO_ZERO_LEN_READ BIT(5) #define I2C_AQ_NO_ZERO_LEN_WRITE BIT(6) #define I2C_AQ_NO_ZERO_LEN (I2C_AQ_NO_ZERO_LEN_READ | I2C_AQ_NO_ZERO_LEN_WRITE) /* adapter cannot do repeated START */ #define I2C_AQ_NO_REP_START BIT(7) /* * i2c_adapter is the structure used to identify a physical i2c bus along * with the access algorithms necessary to access it. */ struct i2c_adapter { struct module *owner; unsigned int class; /* classes to allow probing for */ const struct i2c_algorithm *algo; /* the algorithm to access the bus */ void *algo_data; /* data fields that are valid for all devices */ const struct i2c_lock_operations *lock_ops; struct rt_mutex bus_lock; struct rt_mutex mux_lock; int timeout; /* in jiffies */ int retries; struct device dev; /* the adapter device */ unsigned long locked_flags; /* owned by the I2C core */ #define I2C_ALF_IS_SUSPENDED 0 #define I2C_ALF_SUSPEND_REPORTED 1 int nr; char name[48]; struct completion dev_released; struct mutex userspace_clients_lock; struct list_head userspace_clients; struct i2c_bus_recovery_info *bus_recovery_info; const struct i2c_adapter_quirks *quirks; struct irq_domain *host_notify_domain; struct regulator *bus_regulator; struct dentry *debugfs; }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) static inline void *i2c_get_adapdata(const struct i2c_adapter *adap) { return dev_get_drvdata(&adap->dev); } static inline void i2c_set_adapdata(struct i2c_adapter *adap, void *data) { dev_set_drvdata(&adap->dev, data); } static inline struct i2c_adapter * i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter) { #if IS_ENABLED(CONFIG_I2C_MUX) struct device *parent = adapter->dev.parent; if (parent != NULL && parent->type == &i2c_adapter_type) return to_i2c_adapter(parent); else #endif return NULL; } int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data)); /* Adapter locking functions, exported for shared pin cases */ #define I2C_LOCK_ROOT_ADAPTER BIT(0) #define I2C_LOCK_SEGMENT BIT(1) /** * i2c_lock_bus - Get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER locks the root i2c adapter, I2C_LOCK_SEGMENT * locks only this branch in the adapter tree */ static inline void i2c_lock_bus(struct i2c_adapter *adapter, unsigned int flags) { adapter->lock_ops->lock_bus(adapter, flags); } /** * i2c_trylock_bus - Try to get exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER tries to locks the root i2c adapter, * I2C_LOCK_SEGMENT tries to lock only this branch in the 
adapter tree * * Return: true if the I2C bus segment is locked, false otherwise */ static inline int i2c_trylock_bus(struct i2c_adapter *adapter, unsigned int flags) { return adapter->lock_ops->trylock_bus(adapter, flags); } /** * i2c_unlock_bus - Release exclusive access to an I2C bus segment * @adapter: Target I2C bus segment * @flags: I2C_LOCK_ROOT_ADAPTER unlocks the root i2c adapter, I2C_LOCK_SEGMENT * unlocks only this branch in the adapter tree */ static inline void i2c_unlock_bus(struct i2c_adapter *adapter, unsigned int flags) { adapter->lock_ops->unlock_bus(adapter, flags); } /** * i2c_mark_adapter_suspended - Report suspended state of the adapter to the core * @adap: Adapter to mark as suspended * * When using this helper to mark an adapter as suspended, the core will reject * further transfers to this adapter. The usage of this helper is optional but * recommended for devices having distinct handlers for system suspend and * runtime suspend. More complex devices are free to implement custom solutions * to reject transfers when suspended. */ static inline void i2c_mark_adapter_suspended(struct i2c_adapter *adap) { i2c_lock_bus(adap, I2C_LOCK_ROOT_ADAPTER); set_bit(I2C_ALF_IS_SUSPENDED, &adap->locked_flags); i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER); } /** * i2c_mark_adapter_resumed - Report resumed state of the adapter to the core * @adap: Adapter to mark as resumed * * When using this helper to mark an adapter as resumed, the core will allow * further transfers to this adapter. See also further notes to * @i2c_mark_adapter_suspended(). */ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) { i2c_lock_bus(adap, I2C_LOCK_ROOT_ADAPTER); clear_bit(I2C_ALF_IS_SUSPENDED, &adap->locked_flags); i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER); } /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ #define I2C_CLASS_SPD (1<<7) /* Memory modules */ /* Warn users that the adapter doesn't support classes anymore */ #define I2C_CLASS_DEPRECATED (1<<8) /* Internal numbers to terminate lists */ #define I2C_CLIENT_END 0xfffeU /* Construct an I2C_CLIENT_END-terminated array of i2c addresses */ #define I2C_ADDRS(addr, addrs...) \ ((const unsigned short []){ addr, ## addrs, I2C_CLIENT_END }) /* ----- functions exported by i2c.o */ /* administration... 
*/ #if IS_ENABLED(CONFIG_I2C) int i2c_add_adapter(struct i2c_adapter *adap); int devm_i2c_add_adapter(struct device *dev, struct i2c_adapter *adapter); void i2c_del_adapter(struct i2c_adapter *adap); int i2c_add_numbered_adapter(struct i2c_adapter *adap); int i2c_register_driver(struct module *owner, struct i2c_driver *driver); void i2c_del_driver(struct i2c_driver *driver); /* use a define to avoid include chaining to get THIS_MODULE */ #define i2c_add_driver(driver) \ i2c_register_driver(THIS_MODULE, driver) static inline bool i2c_client_has_driver(struct i2c_client *client) { return !IS_ERR_OR_NULL(client) && client->dev.driver; } /* call the i2c_client->command() of all attached clients with * the given arguments */ void i2c_clients_command(struct i2c_adapter *adap, unsigned int cmd, void *arg); struct i2c_adapter *i2c_get_adapter(int nr); void i2c_put_adapter(struct i2c_adapter *adap); unsigned int i2c_adapter_depth(struct i2c_adapter *adapter); void i2c_parse_fw_timings(struct device *dev, struct i2c_timings *t, bool use_defaults); /* Return the functionality mask */ static inline u32 i2c_get_functionality(struct i2c_adapter *adap) { return adap->algo->functionality(adap); } /* Return 1 if adapter supports everything we need, 0 if not. */ static inline int i2c_check_functionality(struct i2c_adapter *adap, u32 func) { return (func & i2c_get_functionality(adap)) == func; } /** * i2c_check_quirks() - Function for checking the quirk flags in an i2c adapter * @adap: i2c adapter * @quirks: quirk flags * * Return: true if the adapter has all the specified quirk flags, false if not */ static inline bool i2c_check_quirks(struct i2c_adapter *adap, u64 quirks) { if (!adap->quirks) return false; return (adap->quirks->flags & quirks) == quirks; } /* Return the adapter number for a specific adapter */ static inline int i2c_adapter_id(struct i2c_adapter *adap) { return adap->nr; } static inline u8 i2c_8bit_addr_from_msg(const struct i2c_msg *msg) { return (msg->addr << 1) | (msg->flags & I2C_M_RD ? 1 : 0); } u8 *i2c_get_dma_safe_msg_buf(struct i2c_msg *msg, unsigned int threshold); void i2c_put_dma_safe_msg_buf(u8 *buf, struct i2c_msg *msg, bool xferred); int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr); /** * module_i2c_driver() - Helper macro for registering a modular I2C driver * @__i2c_driver: i2c_driver struct * * Helper macro for I2C drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_i2c_driver(__i2c_driver) \ module_driver(__i2c_driver, i2c_add_driver, \ i2c_del_driver) /** * builtin_i2c_driver() - Helper macro for registering a builtin I2C driver * @__i2c_driver: i2c_driver struct * * Helper macro for I2C drivers which do not do anything special in their * init. This eliminates a lot of boilerplate. Each driver may only * use this macro once, and calling it replaces device_initcall(). 
*/ #define builtin_i2c_driver(__i2c_driver) \ builtin_driver(__i2c_driver, i2c_add_driver) #endif /* I2C */ /* must call put_device() when done with returned i2c_client device */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode); /* must call put_device() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode); /* must call i2c_put_adapter() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode); #if IS_ENABLED(CONFIG_OF) /* must call put_device() when done with returned i2c_client device */ static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) { return i2c_find_device_by_fwnode(of_fwnode_handle(node)); } /* must call put_device() when done with returned i2c_adapter device */ static inline struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node) { return i2c_find_adapter_by_fwnode(of_fwnode_handle(node)); } /* must call i2c_put_adapter() when done with returned i2c_adapter device */ static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node) { return i2c_get_adapter_by_fwnode(of_fwnode_handle(node)); } const struct of_device_id *i2c_of_match_device(const struct of_device_id *matches, struct i2c_client *client); int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info); #else static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) { return NULL; } static inline struct i2c_adapter *of_find_i2c_adapter_by_node(struct device_node *node) { return NULL; } static inline struct i2c_adapter *of_get_i2c_adapter_by_node(struct device_node *node) { return NULL; } static inline const struct of_device_id *i2c_of_match_device(const struct of_device_id *matches, struct i2c_client *client) { return NULL; } static inline int of_i2c_get_board_info(struct device *dev, struct device_node *node, struct i2c_board_info *info) { return -ENOTSUPP; } #endif /* CONFIG_OF */ struct acpi_resource; struct acpi_resource_i2c_serialbus; #if IS_ENABLED(CONFIG_ACPI) bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c); int i2c_acpi_client_count(struct acpi_device *adev); u32 i2c_acpi_find_bus_speed(struct device *dev); struct i2c_client *i2c_acpi_new_device_by_fwnode(struct fwnode_handle *fwnode, int index, struct i2c_board_info *info); struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle); bool i2c_acpi_waive_d0_probe(struct device *dev); #else static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c) { return false; } static inline int i2c_acpi_client_count(struct acpi_device *adev) { return 0; } static inline u32 i2c_acpi_find_bus_speed(struct device *dev) { return 0; } static inline struct i2c_client *i2c_acpi_new_device_by_fwnode( struct fwnode_handle *fwnode, int index, struct i2c_board_info *info) { return ERR_PTR(-ENODEV); } static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) { return NULL; } static inline bool i2c_acpi_waive_d0_probe(struct device *dev) { return false; } #endif /* CONFIG_ACPI */ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info) { return i2c_acpi_new_device_by_fwnode(dev_fwnode(dev), index, info); } #endif /* _LINUX_I2C_H */
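/*
 * Illustrative sketch only, appended here for documentation purposes and not
 * part of the header proper: a minimal client driver built on the interfaces
 * declared above.  The "foo" chip name, its FOO_REG_ID register and the
 * matching device-tree/board name are hypothetical.
 */
#include <linux/i2c.h>
#include <linux/module.h>

#define FOO_REG_ID	0x00	/* hypothetical identification register */

static int foo_probe(struct i2c_client *client)
{
	s32 id;

	/* SMBus byte read; a negative return value is an errno */
	id = i2c_smbus_read_byte_data(client, FOO_REG_ID);
	if (id < 0)
		return id;

	dev_info(&client->dev, "foo chip found, id 0x%02x\n", id);
	return 0;
}

static void foo_remove(struct i2c_client *client)
{
	/* nothing to tear down in this sketch */
}

static const struct i2c_device_id foo_id[] = {
	{ "foo" },
	{ }
};
MODULE_DEVICE_TABLE(i2c, foo_id);

static struct i2c_driver foo_driver = {
	.driver = {
		.name	= "foo",
	},
	.probe		= foo_probe,
	.remove		= foo_remove,
	.id_table	= foo_id,
};
module_i2c_driver(foo_driver);

MODULE_LICENSE("GPL");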
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/fs_context.c * * Copyright (C) 1992 Rick Sladkey * Conversion to new mount api Copyright (C) David Howells * * NFS mount handling. * * Split from fs/nfs/super.c by David Howells <dhowells@redhat.com> */ #include <linux/compat.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <net/handshake.h> #include "nfs.h" #include "internal.h" #include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_MOUNT #if IS_ENABLED(CONFIG_NFS_V3) #define NFS_DEFAULT_VERSION 3 #else #define NFS_DEFAULT_VERSION 2 #endif #define NFS_MAX_CONNECTIONS 16 enum nfs_param { Opt_ac, Opt_acdirmax, Opt_acdirmin, Opt_acl, Opt_acregmax, Opt_acregmin, Opt_actimeo, Opt_addr, Opt_bg, Opt_bsize, Opt_clientaddr, Opt_cto, Opt_fg, Opt_fscache, Opt_fscache_flag, Opt_hard, Opt_intr, Opt_local_lock, Opt_lock, Opt_lookupcache, Opt_migration, Opt_minorversion, Opt_mountaddr, Opt_mounthost, Opt_mountport, Opt_mountproto, Opt_mountvers, Opt_namelen, Opt_nconnect, Opt_max_connect, Opt_port, Opt_posix, Opt_proto, Opt_rdirplus, Opt_rdma, Opt_resvport, Opt_retrans, Opt_retry, Opt_rsize, Opt_sec, Opt_sharecache, Opt_sloppy, Opt_soft, Opt_softerr, Opt_softreval, Opt_source, Opt_tcp, Opt_timeo, Opt_trunkdiscovery, Opt_udp, Opt_v, Opt_vers, Opt_wsize, Opt_write, Opt_xprtsec, }; enum { Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_none, Opt_local_lock_posix, }; static const struct constant_table nfs_param_enums_local_lock[] = { { "all", Opt_local_lock_all }, { "flock", Opt_local_lock_flock }, { "posix", Opt_local_lock_posix }, { "none", Opt_local_lock_none }, {} }; enum { Opt_lookupcache_all, Opt_lookupcache_none, Opt_lookupcache_positive, }; static const struct constant_table nfs_param_enums_lookupcache[] = { { "all", Opt_lookupcache_all }, { "none", Opt_lookupcache_none }, { "pos", Opt_lookupcache_positive }, { "positive", Opt_lookupcache_positive }, {} }; enum { Opt_write_lazy, Opt_write_eager, Opt_write_wait, }; static const struct constant_table nfs_param_enums_write[] = { { "lazy", Opt_write_lazy }, { "eager", Opt_write_eager }, { "wait", Opt_write_wait }, {} }; static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag_no("ac", Opt_ac), fsparam_u32 ("acdirmax", Opt_acdirmax), fsparam_u32 ("acdirmin", Opt_acdirmin), fsparam_flag_no("acl", Opt_acl), fsparam_u32 ("acregmax", Opt_acregmax), fsparam_u32 ("acregmin", Opt_acregmin), fsparam_u32 ("actimeo", Opt_actimeo), fsparam_string("addr", Opt_addr), fsparam_flag ("bg", Opt_bg), fsparam_u32 ("bsize", Opt_bsize), fsparam_string("clientaddr", Opt_clientaddr), fsparam_flag_no("cto", Opt_cto), fsparam_flag ("fg", Opt_fg), fsparam_flag_no("fsc", Opt_fscache_flag), fsparam_string("fsc", Opt_fscache), fsparam_flag ("hard", Opt_hard), __fsparam(NULL, "intr", Opt_intr, fs_param_neg_with_no|fs_param_deprecated, NULL), fsparam_enum ("local_lock", Opt_local_lock, nfs_param_enums_local_lock), fsparam_flag_no("lock", Opt_lock), fsparam_enum ("lookupcache", Opt_lookupcache, nfs_param_enums_lookupcache), fsparam_flag_no("migration", Opt_migration), fsparam_u32 ("minorversion", Opt_minorversion),
fsparam_string("mountaddr", Opt_mountaddr), fsparam_string("mounthost", Opt_mounthost), fsparam_u32 ("mountport", Opt_mountport), fsparam_string("mountproto", Opt_mountproto), fsparam_u32 ("mountvers", Opt_mountvers), fsparam_u32 ("namlen", Opt_namelen), fsparam_u32 ("nconnect", Opt_nconnect), fsparam_u32 ("max_connect", Opt_max_connect), fsparam_string("nfsvers", Opt_vers), fsparam_u32 ("port", Opt_port), fsparam_flag_no("posix", Opt_posix), fsparam_string("proto", Opt_proto), fsparam_flag_no("rdirplus", Opt_rdirplus), fsparam_flag ("rdma", Opt_rdma), fsparam_flag_no("resvport", Opt_resvport), fsparam_u32 ("retrans", Opt_retrans), fsparam_string("retry", Opt_retry), fsparam_u32 ("rsize", Opt_rsize), fsparam_string("sec", Opt_sec), fsparam_flag_no("sharecache", Opt_sharecache), fsparam_flag ("sloppy", Opt_sloppy), fsparam_flag ("soft", Opt_soft), fsparam_flag ("softerr", Opt_softerr), fsparam_flag ("softreval", Opt_softreval), fsparam_string("source", Opt_source), fsparam_flag ("tcp", Opt_tcp), fsparam_u32 ("timeo", Opt_timeo), fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery), fsparam_flag ("udp", Opt_udp), fsparam_flag ("v2", Opt_v), fsparam_flag ("v3", Opt_v), fsparam_flag ("v4", Opt_v), fsparam_flag ("v4.0", Opt_v), fsparam_flag ("v4.1", Opt_v), fsparam_flag ("v4.2", Opt_v), fsparam_string("vers", Opt_vers), fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), fsparam_string("xprtsec", Opt_xprtsec), {} }; enum { Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, Opt_vers_4_1, Opt_vers_4_2, }; static const struct constant_table nfs_vers_tokens[] = { { "2", Opt_vers_2 }, { "3", Opt_vers_3 }, { "4", Opt_vers_4 }, { "4.0", Opt_vers_4_0 }, { "4.1", Opt_vers_4_1 }, { "4.2", Opt_vers_4_2 }, {} }; enum { Opt_xprt_rdma, Opt_xprt_rdma6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_udp, Opt_xprt_udp6, nr__Opt_xprt }; static const struct constant_table nfs_xprt_protocol_tokens[] = { { "rdma", Opt_xprt_rdma }, { "rdma6", Opt_xprt_rdma6 }, { "tcp", Opt_xprt_tcp }, { "tcp6", Opt_xprt_tcp6 }, { "udp", Opt_xprt_udp }, { "udp6", Opt_xprt_udp6 }, {} }; enum { Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp, Opt_sec_none, Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp, Opt_sec_sys, nr__Opt_sec }; static const struct constant_table nfs_secflavor_tokens[] = { { "krb5", Opt_sec_krb5 }, { "krb5i", Opt_sec_krb5i }, { "krb5p", Opt_sec_krb5p }, { "lkey", Opt_sec_lkey }, { "lkeyi", Opt_sec_lkeyi }, { "lkeyp", Opt_sec_lkeyp }, { "none", Opt_sec_none }, { "null", Opt_sec_none }, { "spkm3", Opt_sec_spkm }, { "spkm3i", Opt_sec_spkmi }, { "spkm3p", Opt_sec_spkmp }, { "sys", Opt_sec_sys }, {} }; enum { Opt_xprtsec_none, Opt_xprtsec_tls, Opt_xprtsec_mtls, nr__Opt_xprtsec }; static const struct constant_table nfs_xprtsec_policies[] = { { "none", Opt_xprtsec_none }, { "tls", Opt_xprtsec_tls }, { "mtls", Opt_xprtsec_mtls }, {} }; /* * Sanity-check a server address provided by the mount command. * * Address family must be initialized, and address must not be * the ANY address for that family. 
*/ static int nfs_verify_server_address(struct sockaddr_storage *addr) { switch (addr->ss_family) { case AF_INET: { struct sockaddr_in *sa = (struct sockaddr_in *)addr; return sa->sin_addr.s_addr != htonl(INADDR_ANY); } case AF_INET6: { struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr; return !ipv6_addr_any(sa); } } return 0; } #ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) { return true; } #else static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) { if (ctx->version == 4) return true; return false; } #endif /* * Sanity check the NFS transport protocol. */ static int nfs_validate_transport_protocol(struct fs_context *fc, struct nfs_fs_context *ctx) { switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_UDP: if (nfs_server_transport_udp_invalid(ctx)) goto out_invalid_transport_udp; break; case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: break; default: ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; } if (ctx->xprtsec.policy != RPC_XPRTSEC_NONE) switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_TCP: ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP_TLS; break; default: goto out_invalid_xprtsec_policy; } return 0; out_invalid_transport_udp: return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); out_invalid_xprtsec_policy: return nfs_invalf(fc, "NFS: Transport does not support xprtsec"); } /* * For text based NFSv2/v3 mounts, the mount protocol transport default * settings should depend upon the specified NFS transport. */ static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx) { if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP || ctx->mount_server.protocol == XPRT_TRANSPORT_TCP) return; switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_UDP: ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; } } /* * Add 'flavor' to 'auth_info' if not already present. * Returns true if 'flavor' ends up in the list, false otherwise */ static int nfs_auth_info_add(struct fs_context *fc, struct nfs_auth_info *auth_info, rpc_authflavor_t flavor) { unsigned int i; unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors); /* make sure this flavor isn't already in the list */ for (i = 0; i < auth_info->flavor_len; i++) { if (flavor == auth_info->flavors[i]) return 0; } if (auth_info->flavor_len + 1 >= max_flavor_len) return nfs_invalf(fc, "NFS: too many sec= flavors"); auth_info->flavors[auth_info->flavor_len++] = flavor; return 0; } /* * Parse the value of the 'sec=' option. 
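 *
 * The value is a colon-separated list, e.g. "sec=krb5:krb5i"; each
 * recognized flavor is appended to ctx->auth_info via nfs_auth_info_add().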
*/ static int nfs_parse_security_flavors(struct fs_context *fc, struct fs_parameter *param) { struct nfs_fs_context *ctx = nfs_fc2context(fc); rpc_authflavor_t pseudoflavor; char *string = param->string, *p; int ret; trace_nfs_mount_assign(param->key, string); while ((p = strsep(&string, ":")) != NULL) { if (!*p) continue; switch (lookup_constant(nfs_secflavor_tokens, p, -1)) { case Opt_sec_none: pseudoflavor = RPC_AUTH_NULL; break; case Opt_sec_sys: pseudoflavor = RPC_AUTH_UNIX; break; case Opt_sec_krb5: pseudoflavor = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: pseudoflavor = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: pseudoflavor = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: pseudoflavor = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: pseudoflavor = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: pseudoflavor = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: pseudoflavor = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: pseudoflavor = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: pseudoflavor = RPC_AUTH_GSS_SPKMP; break; default: return nfs_invalf(fc, "NFS: sec=%s option not recognized", p); } ret = nfs_auth_info_add(fc, &ctx->auth_info, pseudoflavor); if (ret < 0) return ret; } return 0; } static int nfs_parse_xprtsec_policy(struct fs_context *fc, struct fs_parameter *param) { struct nfs_fs_context *ctx = nfs_fc2context(fc); trace_nfs_mount_assign(param->key, param->string); switch (lookup_constant(nfs_xprtsec_policies, param->string, -1)) { case Opt_xprtsec_none: ctx->xprtsec.policy = RPC_XPRTSEC_NONE; break; case Opt_xprtsec_tls: ctx->xprtsec.policy = RPC_XPRTSEC_TLS_ANON; break; case Opt_xprtsec_mtls: ctx->xprtsec.policy = RPC_XPRTSEC_TLS_X509; break; default: return nfs_invalf(fc, "NFS: Unrecognized transport security policy"); } return 0; } static int nfs_parse_version_string(struct fs_context *fc, const char *string) { struct nfs_fs_context *ctx = nfs_fc2context(fc); ctx->flags &= ~NFS_MOUNT_VER3; switch (lookup_constant(nfs_vers_tokens, string, -1)) { case Opt_vers_2: ctx->version = 2; break; case Opt_vers_3: ctx->flags |= NFS_MOUNT_VER3; ctx->version = 3; break; case Opt_vers_4: /* Backward compatibility option. In future, * the mount program should always supply * a NFSv4 minor version number. */ ctx->version = 4; break; case Opt_vers_4_0: ctx->version = 4; ctx->minorversion = 0; break; case Opt_vers_4_1: ctx->version = 4; ctx->minorversion = 1; break; case Opt_vers_4_2: ctx->version = 4; ctx->minorversion = 2; break; default: return nfs_invalf(fc, "NFS: Unsupported NFS version"); } return 0; } /* * Parse a single mount parameter. */ static int nfs_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; struct nfs_fs_context *ctx = nfs_fc2context(fc); unsigned short protofamily, mountfamily; unsigned int len; int ret, opt; trace_nfs_mount_option(param); opt = fs_parse(fc, nfs_fs_parameters, param, &result); if (opt < 0) return (opt == -ENOPARAM && ctx->sloppy) ? 
1 : opt; if (fc->security) ctx->has_sec_mnt_opts = 1; switch (opt) { case Opt_source: if (fc->source) return nfs_invalf(fc, "NFS: Multiple sources not supported"); fc->source = param->string; param->string = NULL; break; /* * boolean options: foo/nofoo */ case Opt_soft: ctx->flags |= NFS_MOUNT_SOFT; ctx->flags &= ~NFS_MOUNT_SOFTERR; break; case Opt_softerr: ctx->flags |= NFS_MOUNT_SOFTERR | NFS_MOUNT_SOFTREVAL; ctx->flags &= ~NFS_MOUNT_SOFT; break; case Opt_hard: ctx->flags &= ~(NFS_MOUNT_SOFT | NFS_MOUNT_SOFTERR | NFS_MOUNT_SOFTREVAL); break; case Opt_softreval: if (result.negated) ctx->flags &= ~NFS_MOUNT_SOFTREVAL; else ctx->flags |= NFS_MOUNT_SOFTREVAL; break; case Opt_posix: if (result.negated) ctx->flags &= ~NFS_MOUNT_POSIX; else ctx->flags |= NFS_MOUNT_POSIX; break; case Opt_cto: if (result.negated) ctx->flags |= NFS_MOUNT_NOCTO; else ctx->flags &= ~NFS_MOUNT_NOCTO; break; case Opt_trunkdiscovery: if (result.negated) ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY; else ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY; break; case Opt_ac: if (result.negated) ctx->flags |= NFS_MOUNT_NOAC; else ctx->flags &= ~NFS_MOUNT_NOAC; break; case Opt_lock: if (result.negated) { ctx->flags |= NFS_MOUNT_NONLM; ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } else { ctx->flags &= ~NFS_MOUNT_NONLM; ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } break; case Opt_udp: ctx->flags &= ~NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_tcp: case Opt_rdma: ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */ ret = xprt_find_transport_ident(param->key); if (ret < 0) goto out_bad_transport; ctx->nfs_server.protocol = ret; break; case Opt_acl: if (result.negated) ctx->flags |= NFS_MOUNT_NOACL; else ctx->flags &= ~NFS_MOUNT_NOACL; break; case Opt_rdirplus: if (result.negated) ctx->flags |= NFS_MOUNT_NORDIRPLUS; else ctx->flags &= ~NFS_MOUNT_NORDIRPLUS; break; case Opt_sharecache: if (result.negated) ctx->flags |= NFS_MOUNT_UNSHARED; else ctx->flags &= ~NFS_MOUNT_UNSHARED; break; case Opt_resvport: if (result.negated) ctx->flags |= NFS_MOUNT_NORESVPORT; else ctx->flags &= ~NFS_MOUNT_NORESVPORT; break; case Opt_fscache_flag: if (result.negated) ctx->options &= ~NFS_OPTION_FSCACHE; else ctx->options |= NFS_OPTION_FSCACHE; kfree(ctx->fscache_uniq); ctx->fscache_uniq = NULL; break; case Opt_fscache: ctx->options |= NFS_OPTION_FSCACHE; kfree(ctx->fscache_uniq); ctx->fscache_uniq = param->string; param->string = NULL; break; case Opt_migration: if (result.negated) ctx->options &= ~NFS_OPTION_MIGRATION; else ctx->options |= NFS_OPTION_MIGRATION; break; /* * options that take numeric values */ case Opt_port: if (result.uint_32 > USHRT_MAX) goto out_of_bounds; ctx->nfs_server.port = result.uint_32; break; case Opt_rsize: ctx->rsize = result.uint_32; break; case Opt_wsize: ctx->wsize = result.uint_32; break; case Opt_bsize: ctx->bsize = result.uint_32; break; case Opt_timeo: if (result.uint_32 < 1 || result.uint_32 > INT_MAX) goto out_of_bounds; ctx->timeo = result.uint_32; break; case Opt_retrans: if (result.uint_32 > INT_MAX) goto out_of_bounds; ctx->retrans = result.uint_32; break; case Opt_acregmin: ctx->acregmin = result.uint_32; break; case Opt_acregmax: ctx->acregmax = result.uint_32; break; case Opt_acdirmin: ctx->acdirmin = result.uint_32; break; case Opt_acdirmax: ctx->acdirmax = result.uint_32; break; case Opt_actimeo: ctx->acregmin = result.uint_32; ctx->acregmax = result.uint_32; ctx->acdirmin = result.uint_32; ctx->acdirmax = result.uint_32; break; case 
Opt_namelen: ctx->namlen = result.uint_32; break; case Opt_mountport: if (result.uint_32 > USHRT_MAX) goto out_of_bounds; ctx->mount_server.port = result.uint_32; break; case Opt_mountvers: if (result.uint_32 < NFS_MNT_VERSION || result.uint_32 > NFS_MNT3_VERSION) goto out_of_bounds; ctx->mount_server.version = result.uint_32; break; case Opt_minorversion: if (result.uint_32 > NFS4_MAX_MINOR_VERSION) goto out_of_bounds; ctx->minorversion = result.uint_32; break; /* * options that take text values */ case Opt_v: ret = nfs_parse_version_string(fc, param->key + 1); if (ret < 0) return ret; break; case Opt_vers: if (!param->string) goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); ret = nfs_parse_version_string(fc, param->string); if (ret < 0) return ret; break; case Opt_sec: ret = nfs_parse_security_flavors(fc, param); if (ret < 0) return ret; break; case Opt_xprtsec: ret = nfs_parse_xprtsec_policy(fc, param); if (ret < 0) return ret; break; case Opt_proto: if (!param->string) goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); protofamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: protofamily = AF_INET6; fallthrough; case Opt_xprt_udp: ctx->flags &= ~NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: protofamily = AF_INET6; fallthrough; case Opt_xprt_tcp: ctx->flags |= NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma6: protofamily = AF_INET6; fallthrough; case Opt_xprt_rdma: /* vector side protocols to TCP */ ctx->flags |= NFS_MOUNT_TCP; ret = xprt_find_transport_ident(param->string); if (ret < 0) goto out_bad_transport; ctx->nfs_server.protocol = ret; break; default: goto out_bad_transport; } ctx->protofamily = protofamily; break; case Opt_mountproto: if (!param->string) goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); mountfamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: mountfamily = AF_INET6; fallthrough; case Opt_xprt_udp: ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: mountfamily = AF_INET6; fallthrough; case Opt_xprt_tcp: ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma: /* not used for side protocols */ default: goto out_bad_transport; } ctx->mountfamily = mountfamily; break; case Opt_addr: trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->nfs_server.address, sizeof(ctx->nfs_server._address)); if (len == 0) goto out_invalid_address; ctx->nfs_server.addrlen = len; break; case Opt_clientaddr: trace_nfs_mount_assign(param->key, param->string); kfree(ctx->client_address); ctx->client_address = param->string; param->string = NULL; break; case Opt_mounthost: trace_nfs_mount_assign(param->key, param->string); kfree(ctx->mount_server.hostname); ctx->mount_server.hostname = param->string; param->string = NULL; break; case Opt_mountaddr: trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->mount_server.address, sizeof(ctx->mount_server._address)); if (len == 0) goto out_invalid_address; ctx->mount_server.addrlen = len; break; case Opt_nconnect: trace_nfs_mount_assign(param->key, param->string); if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_CONNECTIONS) goto out_of_bounds; ctx->nfs_server.nconnect = result.uint_32; break; case Opt_max_connect: 
trace_nfs_mount_assign(param->key, param->string); if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS) goto out_of_bounds; ctx->nfs_server.max_connect = result.uint_32; break; case Opt_lookupcache: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_lookupcache_all: ctx->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); break; case Opt_lookupcache_positive: ctx->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; break; case Opt_lookupcache_none: ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; break; default: goto out_invalid_value; } break; case Opt_local_lock: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_local_lock_all: ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); break; case Opt_local_lock_flock: ctx->flags |= NFS_MOUNT_LOCAL_FLOCK; break; case Opt_local_lock_posix: ctx->flags |= NFS_MOUNT_LOCAL_FCNTL; break; case Opt_local_lock_none: ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); break; default: goto out_invalid_value; } break; case Opt_write: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_write_lazy: ctx->flags &= ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT); break; case Opt_write_eager: ctx->flags |= NFS_MOUNT_WRITE_EAGER; ctx->flags &= ~NFS_MOUNT_WRITE_WAIT; break; case Opt_write_wait: ctx->flags |= NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT; break; default: goto out_invalid_value; } break; /* * Special options */ case Opt_sloppy: ctx->sloppy = true; break; } return 0; out_invalid_value: return nfs_invalf(fc, "NFS: Bad mount option value specified"); out_invalid_address: return nfs_invalf(fc, "NFS: Bad IP address specified"); out_of_bounds: return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key); out_bad_transport: return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); } /* * Split fc->source into "hostname:export_path". * * The leftmost colon demarks the split between the server's hostname * and the export path. If the hostname starts with a left square * bracket, then it may contain colons. * * Note: caller frees hostname and export path, even on error. */ static int nfs_parse_source(struct fs_context *fc, size_t maxnamlen, size_t maxpathlen) { struct nfs_fs_context *ctx = nfs_fc2context(fc); const char *dev_name = fc->source; size_t len; const char *end; if (unlikely(!dev_name || !*dev_name)) return -EINVAL; /* Is the host name protected with square brackets? */ if (*dev_name == '[') { end = strchr(++dev_name, ']'); if (end == NULL || end[1] != ':') goto out_bad_devname; len = end - dev_name; end++; } else { const char *comma; end = strchr(dev_name, ':'); if (end == NULL) goto out_bad_devname; len = end - dev_name; /* kill possible hostname list: not supported */ comma = memchr(dev_name, ',', len); if (comma) len = comma - dev_name; } if (len > maxnamlen) goto out_hostname; kfree(ctx->nfs_server.hostname); /* N.B.
caller will free nfs_server.hostname in all cases */ ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL); if (!ctx->nfs_server.hostname) goto out_nomem; len = strlen(++end); if (len > maxpathlen) goto out_path; ctx->nfs_server.export_path = kmemdup_nul(end, len, GFP_KERNEL); if (!ctx->nfs_server.export_path) goto out_nomem; trace_nfs_mount_path(ctx->nfs_server.export_path); return 0; out_bad_devname: return nfs_invalf(fc, "NFS: device name not in host:path format"); out_nomem: nfs_errorf(fc, "NFS: not enough memory to parse device name"); return -ENOMEM; out_hostname: nfs_errorf(fc, "NFS: server hostname too long"); return -ENAMETOOLONG; out_path: nfs_errorf(fc, "NFS: export pathname too long"); return -ENAMETOOLONG; } static inline bool is_remount_fc(struct fs_context *fc) { return fc->root != NULL; } /* * Parse monolithic NFS2/NFS3 mount data * - fills in the mount root filehandle * * For option strings, user space handles the following behaviors: * * + DNS: mapping server host name to IP address ("addr=" option) * * + failure mode: how to behave if a mount request can't be handled * immediately ("fg/bg" option) * * + retry: how often to retry a mount request ("retry=" option) * * + breaking back: trying proto=udp after proto=tcp, v2 after v3, * mountproto=tcp after mountproto=udp, and so on */ static int nfs23_parse_monolithic(struct fs_context *fc, struct nfs_mount_data *data) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_fh *mntfh = ctx->mntfh; struct sockaddr_storage *sap = &ctx->nfs_server._address; int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; int ret; if (data == NULL) goto out_no_data; ctx->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: data->namlen = 0; fallthrough; case 2: data->bsize = 0; fallthrough; case 3: if (data->flags & NFS_MOUNT_VER3) goto out_no_v3; data->root.size = NFS2_FHSIZE; memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); /* Turn off security negotiation */ extra_flags |= NFS_MOUNT_SECFLAVOUR; fallthrough; case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; fallthrough; case 5: memset(data->context, 0, sizeof(data->context)); fallthrough; case 6: if (data->flags & NFS_MOUNT_VER3) { if (data->root.size > NFS3_FHSIZE || data->root.size == 0) goto out_invalid_fh; mntfh->size = data->root.size; ctx->version = 3; } else { mntfh->size = NFS2_FHSIZE; ctx->version = 2; } memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) memset(mntfh->data + mntfh->size, 0, sizeof(mntfh->data) - mntfh->size); /* * for proto == XPRT_TRANSPORT_UDP, which is what uses * to_exponential, implying shift: limit the shift value * to BITS_PER_LONG (majortimeo is unsigned long) */ if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */ if (data->retrans >= 64) /* shift value is too large */ goto out_invalid_data; /* * Translate to nfs_fs_context, which nfs_fill_super * can deal with. 
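 * The numeric fields below (rsize, wsize, timeo, retrans and the ac*
 * values) are copied over one-for-one from the legacy binary structure.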
*/ ctx->flags = data->flags & NFS_MOUNT_FLAGMASK; ctx->flags |= extra_flags; ctx->rsize = data->rsize; ctx->wsize = data->wsize; ctx->timeo = data->timeo; ctx->retrans = data->retrans; ctx->acregmin = data->acregmin; ctx->acregmax = data->acregmax; ctx->acdirmin = data->acdirmin; ctx->acdirmax = data->acdirmax; ctx->need_mount = false; memcpy(sap, &data->addr, sizeof(data->addr)); ctx->nfs_server.addrlen = sizeof(data->addr); ctx->nfs_server.port = ntohs(data->addr.sin_port); if (sap->ss_family != AF_INET || !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; /* N.B. caller will free nfs_server.hostname in all cases */ ctx->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); if (!ctx->nfs_server.hostname) goto out_nomem; ctx->namlen = data->namlen; ctx->bsize = data->bsize; if (data->flags & NFS_MOUNT_SECFLAVOUR) ctx->selected_flavor = data->pseudoflavor; else ctx->selected_flavor = RPC_AUTH_UNIX; if (!(data->flags & NFS_MOUNT_NONLM)) ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| NFS_MOUNT_LOCAL_FCNTL); else ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK| NFS_MOUNT_LOCAL_FCNTL); /* * The legacy version 6 binary mount data from userspace has a * field used only to transport selinux information into the * kernel. To continue to support that functionality we * have a touch of selinux knowledge here in the NFS code. The * userspace code converted context=blah to just blah so we are * converting back to the full string selinux understands. */ if (data->context[0]){ #ifdef CONFIG_SECURITY_SELINUX int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; ret = vfs_parse_fs_string(fc, "context", data->context, strlen(data->context)); if (ret < 0) return ret; #else return -EINVAL; #endif } break; default: goto generic; } ret = nfs_validate_transport_protocol(fc, ctx); if (ret) return ret; ctx->skip_reconfig_option_check = true; return 0; generic: return generic_parse_monolithic(fc, data); out_no_data: if (is_remount_fc(fc)) { ctx->skip_reconfig_option_check = true; return 0; } return nfs_invalf(fc, "NFS: mount program didn't pass any mount data"); out_no_v3: return nfs_invalf(fc, "NFS: nfs_mount_data version does not support v3"); out_no_sec: return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS"); out_nomem: return -ENOMEM; out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_invalid_fh: return nfs_invalf(fc, "NFS: invalid root filehandle"); out_invalid_data: return nfs_invalf(fc, "NFS: invalid binary mount data"); } #if IS_ENABLED(CONFIG_NFS_V4) struct compat_nfs_string { compat_uint_t len; compat_uptr_t data; }; static inline void compat_nfs_string(struct nfs_string *dst, struct compat_nfs_string *src) { dst->data = compat_ptr(src->data); dst->len = src->len; } struct compat_nfs4_mount_data_v1 { compat_int_t version; compat_int_t flags; compat_int_t rsize; compat_int_t wsize; compat_int_t timeo; compat_int_t retrans; compat_int_t acregmin; compat_int_t acregmax; compat_int_t acdirmin; compat_int_t acdirmax; struct compat_nfs_string client_addr; struct compat_nfs_string mnt_path; struct compat_nfs_string hostname; compat_uint_t host_addrlen; compat_uptr_t host_addr; compat_int_t proto; compat_int_t auth_flavourlen; compat_uptr_t auth_flavours; }; static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data) { struct compat_nfs4_mount_data_v1 *compat = (struct compat_nfs4_mount_data_v1 *)data; /* copy the fields backwards */ data->auth_flavours = 
compat_ptr(compat->auth_flavours); data->auth_flavourlen = compat->auth_flavourlen; data->proto = compat->proto; data->host_addr = compat_ptr(compat->host_addr); data->host_addrlen = compat->host_addrlen; compat_nfs_string(&data->hostname, &compat->hostname); compat_nfs_string(&data->mnt_path, &compat->mnt_path); compat_nfs_string(&data->client_addr, &compat->client_addr); data->acdirmax = compat->acdirmax; data->acdirmin = compat->acdirmin; data->acregmax = compat->acregmax; data->acregmin = compat->acregmin; data->retrans = compat->retrans; data->timeo = compat->timeo; data->wsize = compat->wsize; data->rsize = compat->rsize; data->flags = compat->flags; data->version = compat->version; } /* * Validate NFSv4 mount options */ static int nfs4_parse_monolithic(struct fs_context *fc, struct nfs4_mount_data *data) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct sockaddr_storage *sap = &ctx->nfs_server._address; int ret; char *c; if (!data) { if (is_remount_fc(fc)) goto done; return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data"); } ctx->version = 4; if (data->version != 1) return generic_parse_monolithic(fc, data); if (in_compat_syscall()) nfs4_compat_mount_data_conv(data); if (data->host_addrlen > sizeof(ctx->nfs_server.address)) goto out_no_address; if (data->host_addrlen == 0) goto out_no_address; ctx->nfs_server.addrlen = data->host_addrlen; if (copy_from_user(sap, data->host_addr, data->host_addrlen)) return -EFAULT; if (!nfs_verify_server_address(sap)) goto out_no_address; ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); if (data->auth_flavourlen) { rpc_authflavor_t pseudoflavor; if (data->auth_flavourlen > 1) goto out_inval_auth; if (copy_from_user(&pseudoflavor, data->auth_flavours, sizeof(pseudoflavor))) return -EFAULT; ctx->selected_flavor = pseudoflavor; } else { ctx->selected_flavor = RPC_AUTH_UNIX; } c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) return PTR_ERR(c); ctx->nfs_server.hostname = c; c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); if (IS_ERR(c)) return PTR_ERR(c); ctx->nfs_server.export_path = c; trace_nfs_mount_path(c); c = strndup_user(data->client_addr.data, 16); if (IS_ERR(c)) return PTR_ERR(c); ctx->client_address = c; /* * Translate to nfs_fs_context, which nfs_fill_super * can deal with. */ ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; ctx->rsize = data->rsize; ctx->wsize = data->wsize; ctx->timeo = data->timeo; ctx->retrans = data->retrans; ctx->acregmin = data->acregmin; ctx->acregmax = data->acregmax; ctx->acdirmin = data->acdirmin; ctx->acdirmax = data->acdirmax; ctx->nfs_server.protocol = data->proto; ret = nfs_validate_transport_protocol(fc, ctx); if (ret) return ret; done: ctx->skip_reconfig_option_check = true; return 0; out_inval_auth: return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d", data->auth_flavourlen); out_no_address: return nfs_invalf(fc, "NFS4: mount program didn't pass remote address"); } #endif /* * Parse a monolithic block of data from sys_mount(). */ static int nfs_fs_context_parse_monolithic(struct fs_context *fc, void *data) { if (fc->fs_type == &nfs_fs_type) return nfs23_parse_monolithic(fc, data); #if IS_ENABLED(CONFIG_NFS_V4) if (fc->fs_type == &nfs4_fs_type) return nfs4_parse_monolithic(fc, data); #endif return nfs_invalf(fc, "NFS: Unsupported monolithic data version"); } /* * Validate the preparsed information in the config. 
*/ static int nfs_fs_context_validate(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_subversion *nfs_mod; struct sockaddr_storage *sap = &ctx->nfs_server._address; int max_namelen = PAGE_SIZE; int max_pathlen = NFS_MAXPATHLEN; int port = 0; int ret; if (!fc->source) goto out_no_device_name; /* Check for sanity first. */ if (ctx->minorversion && ctx->version != 4) goto out_minorversion_mismatch; if (ctx->options & NFS_OPTION_MIGRATION && (ctx->version != 4 || ctx->minorversion != 0)) goto out_migration_misuse; /* Verify that any proto=/mountproto= options match the address * families in the addr=/mountaddr= options. */ if (ctx->protofamily != AF_UNSPEC && ctx->protofamily != ctx->nfs_server.address.sa_family) goto out_proto_mismatch; if (ctx->mountfamily != AF_UNSPEC) { if (ctx->mount_server.addrlen) { if (ctx->mountfamily != ctx->mount_server.address.sa_family) goto out_mountproto_mismatch; } else { if (ctx->mountfamily != ctx->nfs_server.address.sa_family) goto out_mountproto_mismatch; } } if (!nfs_verify_server_address(sap)) goto out_no_address; ret = nfs_validate_transport_protocol(fc, ctx); if (ret) return ret; if (ctx->version == 4) { if (IS_ENABLED(CONFIG_NFS_V4)) { if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) port = NFS_RDMA_PORT; else port = NFS_PORT; max_namelen = NFS4_MAXNAMLEN; max_pathlen = NFS4_MAXPATHLEN; ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL | NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } else { goto out_v4_not_compiled; } } else { nfs_set_mount_transport_protocol(ctx); if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) port = NFS_RDMA_PORT; } nfs_set_port(sap, &ctx->nfs_server.port, port); ret = nfs_parse_source(fc, max_namelen, max_pathlen); if (ret < 0) return ret; /* Load the NFS protocol module if we haven't done so yet */ if (!ctx->nfs_mod) { nfs_mod = get_nfs_version(ctx->version); if (IS_ERR(nfs_mod)) { ret = PTR_ERR(nfs_mod); goto out_version_unavailable; } ctx->nfs_mod = nfs_mod; } /* Ensure the filesystem context has the correct fs_type */ if (fc->fs_type != ctx->nfs_mod->nfs_fs) { module_put(fc->fs_type->owner); __module_get(ctx->nfs_mod->nfs_fs->owner); fc->fs_type = ctx->nfs_mod->nfs_fs; } return 0; out_no_device_name: return nfs_invalf(fc, "NFS: Device name not specified"); out_v4_not_compiled: nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel"); return -EPROTONOSUPPORT; out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_mountproto_mismatch: return nfs_invalf(fc, "NFS: Mount server address does not match mountproto= option"); out_proto_mismatch: return nfs_invalf(fc, "NFS: Server address does not match proto= option"); out_minorversion_mismatch: return nfs_invalf(fc, "NFS: Mount option vers=%u does not support minorversion=%u", ctx->version, ctx->minorversion); out_migration_misuse: return nfs_invalf(fc, "NFS: 'Migration' not supported for this NFS version"); out_version_unavailable: nfs_errorf(fc, "NFS: Version unavailable"); return ret; } /* * Create an NFS superblock by the appropriate method. */ static int nfs_get_tree(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); int err = nfs_fs_context_validate(fc); if (err) return err; if (!ctx->internal) return ctx->nfs_mod->rpc_ops->try_get_tree(fc); else return nfs_get_tree_common(fc); } /* * Handle duplication of a configuration. The caller copied *src into *sc, but * it can't deal with resource pointers in the filesystem context, so we have * to do that. 
We need to clear pointers, copy data or get extra refs as * appropriate. */ static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { struct nfs_fs_context *src = nfs_fc2context(src_fc), *ctx; ctx = kmemdup(src, sizeof(struct nfs_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->mntfh = nfs_alloc_fhandle(); if (!ctx->mntfh) { kfree(ctx); return -ENOMEM; } nfs_copy_fh(ctx->mntfh, src->mntfh); __module_get(ctx->nfs_mod->owner); ctx->client_address = NULL; ctx->mount_server.hostname = NULL; ctx->nfs_server.export_path = NULL; ctx->nfs_server.hostname = NULL; ctx->fscache_uniq = NULL; ctx->clone_data.fattr = NULL; fc->fs_private = ctx; return 0; } static void nfs_fs_context_free(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); if (ctx) { if (ctx->server) nfs_free_server(ctx->server); if (ctx->nfs_mod) put_nfs_version(ctx->nfs_mod); kfree(ctx->client_address); kfree(ctx->mount_server.hostname); kfree(ctx->nfs_server.export_path); kfree(ctx->nfs_server.hostname); kfree(ctx->fscache_uniq); nfs_free_fhandle(ctx->mntfh); nfs_free_fattr(ctx->clone_data.fattr); kfree(ctx); } } static const struct fs_context_operations nfs_fs_context_ops = { .free = nfs_fs_context_free, .dup = nfs_fs_context_dup, .parse_param = nfs_fs_context_parse_param, .parse_monolithic = nfs_fs_context_parse_monolithic, .get_tree = nfs_get_tree, .reconfigure = nfs_reconfigure, }; /* * Prepare superblock configuration. We use the namespaces attached to the * context. This may be the current process's namespaces, or it may be a * container's namespaces. */ static int nfs_init_fs_context(struct fs_context *fc) { struct nfs_fs_context *ctx; ctx = kzalloc(sizeof(struct nfs_fs_context), GFP_KERNEL); if (unlikely(!ctx)) return -ENOMEM; ctx->mntfh = nfs_alloc_fhandle(); if (unlikely(!ctx->mntfh)) { kfree(ctx); return -ENOMEM; } ctx->protofamily = AF_UNSPEC; ctx->mountfamily = AF_UNSPEC; ctx->mount_server.port = NFS_UNSPEC_PORT; if (fc->root) { /* reconfigure, start with the current config */ struct nfs_server *nfss = fc->root->d_sb->s_fs_info; struct net *net = nfss->nfs_client->cl_net; ctx->flags = nfss->flags; ctx->rsize = nfss->rsize; ctx->wsize = nfss->wsize; ctx->retrans = nfss->client->cl_timeout->to_retries; ctx->selected_flavor = nfss->client->cl_auth->au_flavor; ctx->acregmin = nfss->acregmin / HZ; ctx->acregmax = nfss->acregmax / HZ; ctx->acdirmin = nfss->acdirmin / HZ; ctx->acdirmax = nfss->acdirmax / HZ; ctx->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; ctx->nfs_server.port = nfss->port; ctx->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; ctx->version = nfss->nfs_client->rpc_ops->version; ctx->minorversion = nfss->nfs_client->cl_minorversion; memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr, ctx->nfs_server.addrlen); if (fc->net_ns != net) { put_net(fc->net_ns); fc->net_ns = get_net(net); } ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod; __module_get(ctx->nfs_mod->owner); } else { /* defaults */ ctx->timeo = NFS_UNSPEC_TIMEO; ctx->retrans = NFS_UNSPEC_RETRANS; ctx->acregmin = NFS_DEF_ACREGMIN; ctx->acregmax = NFS_DEF_ACREGMAX; ctx->acdirmin = NFS_DEF_ACDIRMIN; ctx->acdirmax = NFS_DEF_ACDIRMAX; ctx->nfs_server.port = NFS_UNSPEC_PORT; ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; ctx->minorversion = 0; ctx->need_mount = true; ctx->xprtsec.policy = RPC_XPRTSEC_NONE; ctx->xprtsec.cert_serial = TLS_NO_CERT; ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY; fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = 
ctx; fc->ops = &nfs_fs_context_ops; return 0; } struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, .name = "nfs", .init_fs_context = nfs_init_fs_context, .parameters = nfs_fs_parameters, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; MODULE_ALIAS_FS("nfs"); EXPORT_SYMBOL_GPL(nfs_fs_type); #if IS_ENABLED(CONFIG_NFS_V4) struct file_system_type nfs4_fs_type = { .owner = THIS_MODULE, .name = "nfs4", .init_fs_context = nfs_init_fs_context, .parameters = nfs_fs_parameters, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; MODULE_ALIAS_FS("nfs4"); MODULE_ALIAS("nfs4"); EXPORT_SYMBOL_GPL(nfs4_fs_type); #endif /* CONFIG_NFS_V4 */
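/*
 * Example (illustrative sketch, not part of the file above): the comment on
 * nfs_parse_source() says the leftmost colon splits "hostname:export_path",
 * unless the hostname is wrapped in square brackets, in which case the colon
 * after the closing bracket is the separator. The userspace program below
 * approximates that rule; split_source() and its return convention are made
 * up for illustration and are not kernel interfaces.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hypothetical helper mirroring the split rule in nfs_parse_source() */
static int split_source(const char *dev_name, char **host, char **path)
{
	const char *end;
	size_t len;

	if (!dev_name || !*dev_name)
		return -1;

	if (*dev_name == '[') {			/* e.g. "[fe80::1]:/export" */
		end = strchr(++dev_name, ']');
		if (!end || end[1] != ':')
			return -1;
		len = end - dev_name;
		end++;				/* end now points at the ':' */
	} else {				/* e.g. "server:/export" */
		end = strchr(dev_name, ':');
		if (!end)
			return -1;
		len = end - dev_name;
		/* the kernel also truncates at ',' to drop hostname lists */
	}

	*host = strndup(dev_name, len);
	*path = strdup(end + 1);
	return (*host && *path) ? 0 : -1;
}

int main(void)
{
	char *host, *path;

	if (!split_source("[fe80::1]:/srv/nfs", &host, &path)) {
		printf("host=%s path=%s\n", host, path);
		free(host);
		free(path);
	}
	return 0;
}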
324 326 304 304 305 303 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 // SPDX-License-Identifier: GPL-2.0 /* * of.c The helpers for hcd device tree support * * Copyright (C) 2016 Freescale Semiconductor, Inc. * Author: Peter Chen <peter.chen@freescale.com> * Copyright (C) 2017 Johan Hovold <johan@kernel.org> */ #include <linux/of.h> #include <linux/usb/of.h> /** * usb_of_get_device_node() - get a USB device node * @hub: hub to which device is connected * @port1: one-based index of port * * Look up the node of a USB device given its parent hub device and one-based * port number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node *usb_of_get_device_node(struct usb_device *hub, int port1) { struct device_node *node; u32 reg; for_each_child_of_node(hub->dev.of_node, node) { if (of_property_read_u32(node, "reg", &reg)) continue; if (reg == port1) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_device_node); /** * usb_of_has_combined_node() - determine whether a device has a combined node * @udev: USB device * * Determine whether a USB device has a so called combined node which is * shared with its sole interface. This is the case if and only if the device * has a node and its descriptors report the following: * * 1) bDeviceClass is 0 or 9, and * 2) bNumConfigurations is 1, and * 3) bNumInterfaces is 1. * * Return: True iff the device has a device node and its descriptors match the * criteria for a combined node. */ bool usb_of_has_combined_node(struct usb_device *udev) { struct usb_device_descriptor *ddesc = &udev->descriptor; struct usb_config_descriptor *cdesc; if (!udev->dev.of_node) return false; switch (ddesc->bDeviceClass) { case USB_CLASS_PER_INTERFACE: case USB_CLASS_HUB: if (ddesc->bNumConfigurations == 1) { cdesc = &udev->config->desc; if (cdesc->bNumInterfaces == 1) return true; } } return false; } EXPORT_SYMBOL_GPL(usb_of_has_combined_node); /** * usb_of_get_interface_node() - get a USB interface node * @udev: USB device of interface * @config: configuration value * @ifnum: interface number * * Look up the node of a USB interface given its USB device, configuration * value and interface number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node * usb_of_get_interface_node(struct usb_device *udev, u8 config, u8 ifnum) { struct device_node *node; u32 reg[2]; for_each_child_of_node(udev->dev.of_node, node) { if (of_property_read_u32_array(node, "reg", reg, 2)) continue; if (reg[0] == ifnum && reg[1] == config) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_interface_node);
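/*
 * Example (illustrative sketch, not part of the file above): a minimal
 * caller of usb_of_get_device_node(). The function example_lookup() and its
 * arguments are hypothetical; the point is that the returned node carries an
 * extra reference, so the caller must drop it with of_node_put() when it is
 * done with the node.
 */
#include <linux/of.h>
#include <linux/usb.h>
#include <linux/usb/of.h>

static void example_lookup(struct usb_device *hub_udev, int port1)
{
	struct device_node *node;

	node = usb_of_get_device_node(hub_udev, port1);
	if (!node)
		return;			/* no child node with reg == port1 */

	/* ... read device-specific properties from node here ... */

	of_node_put(node);		/* balance the reference taken above */
}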
2334 4 173 173 1207 727 727 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_GENERIC_PGALLOC_H #define __ASM_GENERIC_PGALLOC_H #ifdef CONFIG_MMU #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) /** * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * This function is intended for architectures that need * anything beyond simple page allocation. * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm) { struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); if (!ptdesc) return NULL; return ptdesc_address(ptdesc); } #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL /** * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * Return: pointer to the allocated memory or %NULL on error */ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { return __pte_alloc_one_kernel(mm); } #endif /** * pte_free_kernel - free PTE-level kernel page table memory * @mm: the mm_struct of the current context * @pte: pointer to the memory containing the page table */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { pagetable_free(virt_to_ptdesc(pte)); } /** * __pte_alloc_one - allocate memory for a PTE-level user page table * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) { struct ptdesc *ptdesc; ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_page(ptdesc); } #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } #endif /* * Should really implement gc for free page table pages. This could be * done with a reference count in struct page. 
*/ /** * pte_free - free PTE-level user page table memory * @mm: the mm_struct of the current context * @pte_page: the `struct page` referencing the ptdesc */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); pagetable_pte_dtor(ptdesc); pagetable_free(ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 #ifndef __HAVE_ARCH_PMD_ALLOC_ONE /** * pmd_alloc_one - allocate memory for a PMD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor(). * * Allocations use %GFP_PGTABLE_USER in user context and * %GFP_PGTABLE_KERNEL in kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { struct ptdesc *ptdesc; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pmd_ctor(ptdesc)) { pagetable_free(ptdesc); return NULL; } return ptdesc_address(ptdesc); } #endif #ifndef __HAVE_ARCH_PMD_FREE static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); pagetable_pmd_dtor(ptdesc); pagetable_free(ptdesc); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) return NULL; pagetable_pud_ctor(ptdesc); return ptdesc_address(ptdesc); } #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate memory for a PUD-level page table * @mm: the mm_struct of the current context * * Allocate memory for a page table using %GFP_PGTABLE_USER for user context * and %GFP_PGTABLE_KERNEL for kernel context. * * Return: pointer to the allocated memory or %NULL on error */ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { return __pud_alloc_one(mm, addr); } #endif static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); pagetable_pud_dtor(ptdesc); pagetable_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE static inline void pud_free(struct mm_struct *mm, pud_t *pud) { __pud_free(mm, pud); } #endif #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pagetable_free(virt_to_ptdesc(pgd)); } #endif #endif /* CONFIG_MMU */ #endif /* __ASM_GENERIC_PGALLOC_H */
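/*
 * Example (illustrative sketch, not taken from any real architecture): the
 * #ifndef __HAVE_ARCH_* guards above let an architecture override a single
 * allocator while reusing the generic helpers. A hypothetical <asm/pgalloc.h>
 * fragment would define the guard and then delegate to __pte_alloc_one()
 * with arch-specific GFP flags, for instance to allow user PTE pages to be
 * allocated from highmem.
 */
#define __HAVE_ARCH_PTE_ALLOC_ONE
#include <asm-generic/pgalloc.h>

static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	/* hypothetical policy: user page tables may come from highmem */
	return __pte_alloc_one(mm, GFP_PGTABLE_USER | __GFP_HIGHMEM);
}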
// SPDX-License-Identifier: GPL-2.0-only /* * pcrypt - Parallel crypto wrapper. * * Copyright (C) 2009 secunet Security Networks AG * Copyright (C) 2009 Steffen Klassert <steffen.klassert@secunet.com> */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kobject.h> #include <linux/cpu.h> #include <crypto/pcrypt.h> static struct padata_instance *pencrypt; static struct padata_instance *pdecrypt; static struct kset *pcrypt_kset; struct pcrypt_instance_ctx { struct crypto_aead_spawn spawn; struct padata_shell *psenc; struct padata_shell *psdec; atomic_t tfm_count; }; struct pcrypt_aead_ctx { struct crypto_aead *child; unsigned int cb_cpu; }; static inline struct pcrypt_instance_ctx *pcrypt_tfm_ictx( struct crypto_aead *tfm) { return aead_instance_ctx(aead_alg_instance(tfm)); } static int pcrypt_aead_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setkey(ctx->child, key, keylen); } static int pcrypt_aead_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setauthsize(ctx->child, authsize); } static void pcrypt_aead_serial(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); aead_request_complete(req->base.data, padata->info); } static void pcrypt_aead_done(void *data, int err) { struct aead_request *req = data; struct pcrypt_request *preq = aead_request_ctx(req); struct padata_priv *padata = pcrypt_request_padata(preq); padata->info = err; padata_do_serial(padata); } static void pcrypt_aead_enc(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_encrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret; padata_do_serial(padata); }
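/*
 * Example (illustrative sketch, not part of pcrypt.c): the callbacks above
 * follow the generic padata pattern, a parallel callback that does the work
 * on some CPU, padata_do_serial() to hand the result back, and a serial
 * callback that runs in submission order. The demo_* names below are made
 * up; the padata_do_parallel() call mirrors the one in pcrypt_aead_encrypt().
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/padata.h>
#include <linux/slab.h>

struct demo_job {
	struct padata_priv padata;	/* embedded so container_of() works */
	int input;
	int result;
};

static void demo_parallel(struct padata_priv *padata)
{
	struct demo_job *job = container_of(padata, struct demo_job, padata);

	job->result = job->input * 2;	/* the "expensive" work, run in parallel */
	padata_do_serial(padata);	/* queue for in-order completion */
}

static void demo_serial(struct padata_priv *padata)
{
	struct demo_job *job = container_of(padata, struct demo_job, padata);

	/* runs in submission order, like pcrypt_aead_serial() */
	pr_info("job done: %d\n", job->result);
	kfree(job);
}

static int demo_submit(struct padata_shell *ps, int value, unsigned int *cb_cpu)
{
	struct demo_job *job = kzalloc(sizeof(*job), GFP_KERNEL);

	if (!job)
		return -ENOMEM;
	job->input = value;
	job->padata.parallel = demo_parallel;
	job->padata.serial = demo_serial;
	return padata_do_parallel(ps, &job->padata, cb_cpu);
}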
static int pcrypt_aead_encrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_enc; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psenc, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) return -EAGAIN; return err; } static void pcrypt_aead_dec(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_decrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_dec; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psdec, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) return -EAGAIN; return err; } static int pcrypt_aead_init_tfm(struct crypto_aead *tfm) { int cpu, cpu_index; struct aead_instance *inst = aead_alg_instance(tfm); struct pcrypt_instance_ctx *ictx = aead_instance_ctx(inst); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *cipher; cpu_index = (unsigned int)atomic_inc_return(&ictx->tfm_count) % cpumask_weight(cpu_online_mask); ctx->cb_cpu = cpumask_first(cpu_online_mask); for (cpu = 0; cpu < cpu_index; cpu++) ctx->cb_cpu = cpumask_next(ctx->cb_cpu, cpu_online_mask); cipher = crypto_spawn_aead(&ictx->spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_aead_set_reqsize(tfm, sizeof(struct pcrypt_request) + sizeof(struct aead_request) + crypto_aead_reqsize(cipher)); return 0; } static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void pcrypt_free(struct aead_instance *inst) { struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_aead(&ctx->spawn); padata_free_shell(ctx->psdec); padata_free_shell(ctx->psenc); kfree(inst); } static int pcrypt_init_instance(struct crypto_instance *inst, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "pcrypt(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; 
memcpy(inst->alg.cra_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); inst->alg.cra_priority = alg->cra_priority + 100; inst->alg.cra_blocksize = alg->cra_blocksize; inst->alg.cra_alignmask = alg->cra_alignmask; return 0; } static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt) { struct pcrypt_instance_ctx *ctx; struct aead_instance *inst; struct aead_alg *alg; u32 mask = crypto_algt_inherited_mask(algt); int err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; err = -ENOMEM; ctx = aead_instance_ctx(inst); ctx->psenc = padata_alloc_shell(pencrypt); if (!ctx->psenc) goto err_free_inst; ctx->psdec = padata_alloc_shell(pdecrypt); if (!ctx->psdec) goto err_free_inst; err = crypto_grab_aead(&ctx->spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(&ctx->spawn); err = pcrypt_init_instance(aead_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC; inst->alg.ivsize = crypto_aead_alg_ivsize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.base.cra_ctxsize = sizeof(struct pcrypt_aead_ctx); inst->alg.init = pcrypt_aead_init_tfm; inst->alg.exit = pcrypt_aead_exit_tfm; inst->alg.setkey = pcrypt_aead_setkey; inst->alg.setauthsize = pcrypt_aead_setauthsize; inst->alg.encrypt = pcrypt_aead_encrypt; inst->alg.decrypt = pcrypt_aead_decrypt; inst->free = pcrypt_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: pcrypt_free(inst); } return err; } static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_AEAD: return pcrypt_create_aead(tmpl, tb, algt); } return -EINVAL; } static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name) { int ret; pinst->kobj.kset = pcrypt_kset; ret = kobject_add(&pinst->kobj, NULL, "%s", name); if (!ret) kobject_uevent(&pinst->kobj, KOBJ_ADD); return ret; } static int pcrypt_init_padata(struct padata_instance **pinst, const char *name) { int ret = -ENOMEM; *pinst = padata_alloc(name); if (!*pinst) return ret; ret = pcrypt_sysfs_add(*pinst, name); if (ret) padata_free(*pinst); return ret; } static struct crypto_template pcrypt_tmpl = { .name = "pcrypt", .create = pcrypt_create, .module = THIS_MODULE, }; static int __init pcrypt_init(void) { int err = -ENOMEM; pcrypt_kset = kset_create_and_add("pcrypt", NULL, kernel_kobj); if (!pcrypt_kset) goto err; err = pcrypt_init_padata(&pencrypt, "pencrypt"); if (err) goto err_unreg_kset; err = pcrypt_init_padata(&pdecrypt, "pdecrypt"); if (err) goto err_deinit_pencrypt; return crypto_register_template(&pcrypt_tmpl); err_deinit_pencrypt: padata_free(pencrypt); err_unreg_kset: kset_unregister(pcrypt_kset); err: return err; } static void __exit pcrypt_exit(void) { crypto_unregister_template(&pcrypt_tmpl); padata_free(pencrypt); padata_free(pdecrypt); kset_unregister(pcrypt_kset); } subsys_initcall(pcrypt_init); module_exit(pcrypt_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("Parallel crypto wrapper"); MODULE_ALIAS_CRYPTO("pcrypt");
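/*
 * Example (illustrative sketch, not part of pcrypt.c): once the "pcrypt"
 * template is registered, a kernel user can request a parallelized AEAD by
 * wrapping an existing algorithm name, e.g. "pcrypt(gcm(aes))". The function
 * below is hypothetical and uses an all-zero placeholder key purely for
 * demonstration; it assumes a gcm(aes) implementation is available.
 */
#include <crypto/aead.h>
#include <linux/err.h>
#include <linux/types.h>

static int example_alloc_pcrypt_aead(void)
{
	struct crypto_aead *tfm;
	static const u8 key[16];	/* placeholder key, never use in real code */
	int err;

	tfm = crypto_alloc_aead("pcrypt(gcm(aes))", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setauthsize(tfm, 16);
	if (!err)
		err = crypto_aead_setkey(tfm, key, sizeof(key));

	/* AEAD requests are then issued exactly as with plain gcm(aes) */
	crypto_free_aead(tfm);
	return err;
}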
// SPDX-License-Identifier: GPL-2.0 /* * message.c - synchronous message handling * * Released under the GPLv2 only. */ #include <linux/acpi.h> #include <linux/pci.h> /* for scatterlist macros */ #include <linux/usb.h> #include <linux/module.h> #include <linux/of.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/timer.h> #include <linux/ctype.h> #include <linux/nls.h> #include <linux/device.h> #include <linux/scatterlist.h> #include <linux/usb/cdc.h> #include <linux/usb/quirks.h> #include <linux/usb/hcd.h> /* for usbcore internals */ #include <linux/usb/of.h> #include <asm/byteorder.h> #include "usb.h" static void cancel_async_set_config(struct usb_device *udev); struct api_context { struct completion done; int status; }; static void usb_api_blocking_completion(struct urb *urb) { struct api_context *ctx = urb->context; ctx->status = urb->status; complete(&ctx->done); } /* * Starts urb and waits for completion or timeout. Note that this call * is NOT interruptible. Many device driver i/o requests should be * interruptible and therefore these drivers should implement their * own interruptible routines. */ static int usb_start_wait_urb(struct urb *urb, int timeout, int *actual_length) { struct api_context ctx; unsigned long expire; int retval; init_completion(&ctx.done); urb->context = &ctx; urb->actual_length = 0; retval = usb_submit_urb(urb, GFP_NOIO); if (unlikely(retval)) goto out; expire = timeout ? msecs_to_jiffies(timeout) : MAX_SCHEDULE_TIMEOUT; if (!wait_for_completion_timeout(&ctx.done, expire)) { usb_kill_urb(urb); retval = (ctx.status == -ENOENT ? -ETIMEDOUT : ctx.status); dev_dbg(&urb->dev->dev, "%s timed out on ep%d%s len=%u/%u\n", current->comm, usb_endpoint_num(&urb->ep->desc), usb_urb_dir_in(urb) ?
"in" : "out", urb->actual_length, urb->transfer_buffer_length); } else retval = ctx.status; out: if (actual_length) *actual_length = urb->actual_length; usb_free_urb(urb); return retval; } /*-------------------------------------------------------------------*/ /* returns status (negative) or length (positive) */ static int usb_internal_control_msg(struct usb_device *usb_dev, unsigned int pipe, struct usb_ctrlrequest *cmd, void *data, int len, int timeout) { struct urb *urb; int retv; int length; urb = usb_alloc_urb(0, GFP_NOIO); if (!urb) return -ENOMEM; usb_fill_control_urb(urb, usb_dev, pipe, (unsigned char *)cmd, data, len, usb_api_blocking_completion, NULL); retv = usb_start_wait_urb(urb, timeout, &length); if (retv < 0) return retv; else return length; } /** * usb_control_msg - Builds a control urb, sends it off and waits for completion * @dev: pointer to the usb device to send the message to * @pipe: endpoint "pipe" to send the message to * @request: USB message request value * @requesttype: USB message request type value * @value: USB message value * @index: USB message index value * @data: pointer to the data to send * @size: length in bytes of the data to send * @timeout: time in msecs to wait for the message to complete before timing * out (if 0 the wait is forever) * * Context: task context, might sleep. * * This function sends a simple control message to a specified endpoint and * waits for the message to complete, or timeout. * * Don't use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb(). If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * Return: If successful, the number of bytes transferred. Otherwise, a negative * error number. */ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size, int timeout) { struct usb_ctrlrequest *dr; int ret; dr = kmalloc(sizeof(struct usb_ctrlrequest), GFP_NOIO); if (!dr) return -ENOMEM; dr->bRequestType = requesttype; dr->bRequest = request; dr->wValue = cpu_to_le16(value); dr->wIndex = cpu_to_le16(index); dr->wLength = cpu_to_le16(size); ret = usb_internal_control_msg(dev, pipe, dr, data, size, timeout); /* Linger a bit, prior to the next control message. */ if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) msleep(200); kfree(dr); return ret; } EXPORT_SYMBOL_GPL(usb_control_msg); /** * usb_control_msg_send - Builds a control "send" message, sends it off and waits for completion * @dev: pointer to the usb device to send the message to * @endpoint: endpoint to send the message to * @request: USB message request value * @requesttype: USB message request type value * @value: USB message value * @index: USB message index value * @driver_data: pointer to the data to send * @size: length in bytes of the data to send * @timeout: time in msecs to wait for the message to complete before timing * out (if 0 the wait is forever) * @memflags: the flags for memory allocation for buffers * * Context: !in_interrupt () * * This function sends a control message to a specified endpoint that is not * expected to fill in a response (i.e. a "send message") and waits for the * message to complete, or timeout. * * Do not use this function from within an interrupt context. 
If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb(). If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * The data pointer can be made to a reference on the stack, or anywhere else, * as it will not be modified at all. This does not have the restriction that * usb_control_msg() has where the data pointer must be to dynamically allocated * memory (i.e. memory that can be successfully DMAed to a device). * * Return: If successful, 0 is returned, Otherwise, a negative error number. */ int usb_control_msg_send(struct usb_device *dev, __u8 endpoint, __u8 request, __u8 requesttype, __u16 value, __u16 index, const void *driver_data, __u16 size, int timeout, gfp_t memflags) { unsigned int pipe = usb_sndctrlpipe(dev, endpoint); int ret; u8 *data = NULL; if (size) { data = kmemdup(driver_data, size, memflags); if (!data) return -ENOMEM; } ret = usb_control_msg(dev, pipe, request, requesttype, value, index, data, size, timeout); kfree(data); if (ret < 0) return ret; return 0; } EXPORT_SYMBOL_GPL(usb_control_msg_send); /** * usb_control_msg_recv - Builds a control "receive" message, sends it off and waits for completion * @dev: pointer to the usb device to send the message to * @endpoint: endpoint to send the message to * @request: USB message request value * @requesttype: USB message request type value * @value: USB message value * @index: USB message index value * @driver_data: pointer to the data to be filled in by the message * @size: length in bytes of the data to be received * @timeout: time in msecs to wait for the message to complete before timing * out (if 0 the wait is forever) * @memflags: the flags for memory allocation for buffers * * Context: !in_interrupt () * * This function sends a control message to a specified endpoint that is * expected to fill in a response (i.e. a "receive message") and waits for the * message to complete, or timeout. * * Do not use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb(). If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * The data pointer can be made to a reference on the stack, or anywhere else * that can be successfully written to. This function does not have the * restriction that usb_control_msg() has where the data pointer must be to * dynamically allocated memory (i.e. memory that can be successfully DMAed to a * device). * * The "whole" message must be properly received from the device in order for * this function to be successful. If a device returns less than the expected * amount of data, then the function will fail. Do not use this for messages * where a variable amount of data might be returned. * * Return: If successful, 0 is returned, Otherwise, a negative error number. 
*/ int usb_control_msg_recv(struct usb_device *dev, __u8 endpoint, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *driver_data, __u16 size, int timeout, gfp_t memflags) { unsigned int pipe = usb_rcvctrlpipe(dev, endpoint); int ret; u8 *data; if (!size || !driver_data) return -EINVAL; data = kmalloc(size, memflags); if (!data) return -ENOMEM; ret = usb_control_msg(dev, pipe, request, requesttype, value, index, data, size, timeout); if (ret < 0) goto exit; if (ret == size) { memcpy(driver_data, data, size); ret = 0; } else { ret = -EREMOTEIO; } exit: kfree(data); return ret; } EXPORT_SYMBOL_GPL(usb_control_msg_recv); /** * usb_interrupt_msg - Builds an interrupt urb, sends it off and waits for completion * @usb_dev: pointer to the usb device to send the message to * @pipe: endpoint "pipe" to send the message to * @data: pointer to the data to send * @len: length in bytes of the data to send * @actual_length: pointer to a location to put the actual length transferred * in bytes * @timeout: time in msecs to wait for the message to complete before * timing out (if 0 the wait is forever) * * Context: task context, might sleep. * * This function sends a simple interrupt message to a specified endpoint and * waits for the message to complete, or timeout. * * Don't use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb() If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * Return: * If successful, 0. Otherwise a negative error number. The number of actual * bytes transferred will be stored in the @actual_length parameter. */ int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout) { return usb_bulk_msg(usb_dev, pipe, data, len, actual_length, timeout); } EXPORT_SYMBOL_GPL(usb_interrupt_msg); /** * usb_bulk_msg - Builds a bulk urb, sends it off and waits for completion * @usb_dev: pointer to the usb device to send the message to * @pipe: endpoint "pipe" to send the message to * @data: pointer to the data to send * @len: length in bytes of the data to send * @actual_length: pointer to a location to put the actual length transferred * in bytes * @timeout: time in msecs to wait for the message to complete before * timing out (if 0 the wait is forever) * * Context: task context, might sleep. * * This function sends a simple bulk message to a specified endpoint * and waits for the message to complete, or timeout. * * Don't use this function from within an interrupt context. If you need * an asynchronous message, or need to send a message from within interrupt * context, use usb_submit_urb() If a thread in your driver uses this call, * make sure your disconnect() method can wait for it to complete. Since you * don't have a handle on the URB used, you can't cancel the request. * * Because there is no usb_interrupt_msg() and no USBDEVFS_INTERRUPT ioctl, * users are forced to abuse this routine by using it to submit URBs for * interrupt endpoints. We will take the liberty of creating an interrupt URB * (with the default interval) if the target is an interrupt endpoint. * * Return: * If successful, 0. Otherwise a negative error number. The number of actual * bytes transferred will be stored in the @actual_length parameter. 
* */ int usb_bulk_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout) { struct urb *urb; struct usb_host_endpoint *ep; ep = usb_pipe_endpoint(usb_dev, pipe); if (!ep || len < 0) return -EINVAL; urb = usb_alloc_urb(0, GFP_KERNEL); if (!urb) return -ENOMEM; if ((ep->desc.bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT) { pipe = (pipe & ~(3 << 30)) | (PIPE_INTERRUPT << 30); usb_fill_int_urb(urb, usb_dev, pipe, data, len, usb_api_blocking_completion, NULL, ep->desc.bInterval); } else usb_fill_bulk_urb(urb, usb_dev, pipe, data, len, usb_api_blocking_completion, NULL); return usb_start_wait_urb(urb, timeout, actual_length); } EXPORT_SYMBOL_GPL(usb_bulk_msg); /*-------------------------------------------------------------------*/ static void sg_clean(struct usb_sg_request *io) { if (io->urbs) { while (io->entries--) usb_free_urb(io->urbs[io->entries]); kfree(io->urbs); io->urbs = NULL; } io->dev = NULL; } static void sg_complete(struct urb *urb) { unsigned long flags; struct usb_sg_request *io = urb->context; int status = urb->status; spin_lock_irqsave(&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets * device driver code (like this routine) unlink queued urbs first, * if it needs to, since the HC won't work on them at all. So it's * not possible for page N+1 to overwrite page N, and so on. * * That's only for "hard" faults; "soft" faults (unlinks) sometimes * complete before the HCD can get requests away from hardware, * though never during cleanup after a hard fault. */ if (io->status && (io->status != -ECONNRESET || status != -ECONNRESET) && urb->actual_length) { dev_err(io->dev->bus->controller, "dev %s ep%d%s scatterlist error %d/%d\n", io->dev->devpath, usb_endpoint_num(&urb->ep->desc), usb_urb_dir_in(urb) ? "in" : "out", status, io->status); /* BUG (); */ } if (io->status == 0 && status && status != -ECONNRESET) { int i, found, retval; io->status = status; /* the previous urbs, and this one, completed already. * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... */ spin_unlock_irqrestore(&io->lock, flags); for (i = 0, found = 0; i < io->entries; i++) { if (!io->urbs[i]) continue; if (found) { usb_block_urb(io->urbs[i]); retval = usb_unlink_urb(io->urbs[i]); if (retval != -EINPROGRESS && retval != -ENODEV && retval != -EBUSY && retval != -EIDRM) dev_err(&io->dev->dev, "%s, unlink --> %d\n", __func__, retval); } else if (urb == io->urbs[i]) found = 1; } spin_lock_irqsave(&io->lock, flags); } /* on the last completion, signal usb_sg_wait() */ io->bytes += urb->actual_length; io->count--; if (!io->count) complete(&io->complete); spin_unlock_irqrestore(&io->lock, flags); } /** * usb_sg_init - initializes scatterlist-based bulk/interrupt I/O request * @io: request block being initialized. until usb_sg_wait() returns, * treat this as a pointer to an opaque block of memory, * @dev: the usb device that will send or receive the data * @pipe: endpoint "pipe" used to transfer the data * @period: polling rate for interrupt endpoints, in frames or * (for high speed endpoints) microframes; ignored for bulk * @sg: scatterlist entries * @nents: how many entries in the scatterlist * @length: how many bytes to send from the scatterlist, or zero to * send every byte identified in the list. 
* @mem_flags: SLAB_* flags affecting memory allocations in this call * * This initializes a scatter/gather request, allocating resources such as * I/O mappings and urb memory (except maybe memory used by USB controller * drivers). * * The request must be issued using usb_sg_wait(), which waits for the I/O to * complete (or to be canceled) and then cleans up all resources allocated by * usb_sg_init(). * * The request may be canceled with usb_sg_cancel(), either before or after * usb_sg_wait() is called. * * Return: Zero for success, else a negative errno value. */ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev, unsigned pipe, unsigned period, struct scatterlist *sg, int nents, size_t length, gfp_t mem_flags) { int i; int urb_flags; int use_sg; if (!io || !dev || !sg || usb_pipecontrol(pipe) || usb_pipeisoc(pipe) || nents <= 0) return -EINVAL; spin_lock_init(&io->lock); io->dev = dev; io->pipe = pipe; if (dev->bus->sg_tablesize > 0) { use_sg = true; io->entries = 1; } else { use_sg = false; io->entries = nents; } /* initialize all the urbs we'll use */ io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs), mem_flags); if (!io->urbs) goto nomem; urb_flags = URB_NO_INTERRUPT; if (usb_pipein(pipe)) urb_flags |= URB_SHORT_NOT_OK; for_each_sg(sg, sg, io->entries, i) { struct urb *urb; unsigned len; urb = usb_alloc_urb(0, mem_flags); if (!urb) { io->entries = i; goto nomem; } io->urbs[i] = urb; urb->dev = NULL; urb->pipe = pipe; urb->interval = period; urb->transfer_flags = urb_flags; urb->complete = sg_complete; urb->context = io; urb->sg = sg; if (use_sg) { /* There is no single transfer buffer */ urb->transfer_buffer = NULL; urb->num_sgs = nents; /* A length of zero means transfer the whole sg list */ len = length; if (len == 0) { struct scatterlist *sg2; int j; for_each_sg(sg, sg2, nents, j) len += sg2->length; } } else { /* * Some systems can't use DMA; they use PIO instead. * For their sakes, transfer_buffer is set whenever * possible. */ if (!PageHighMem(sg_page(sg))) urb->transfer_buffer = sg_virt(sg); else urb->transfer_buffer = NULL; len = sg->length; if (length) { len = min_t(size_t, len, length); length -= len; if (length == 0) io->entries = i + 1; } } urb->transfer_buffer_length = len; } io->urbs[--i]->transfer_flags &= ~URB_NO_INTERRUPT; /* transaction state */ io->count = io->entries; io->status = 0; io->bytes = 0; init_completion(&io->complete); return 0; nomem: sg_clean(io); return -ENOMEM; } EXPORT_SYMBOL_GPL(usb_sg_init); /** * usb_sg_wait - synchronously execute scatter/gather request * @io: request block handle, as initialized with usb_sg_init(). * some fields become accessible when this call returns. * * Context: task context, might sleep. * * This function blocks until the specified I/O operation completes. It * leverages the grouping of the related I/O requests to get good transfer * rates, by queueing the requests. At higher speeds, such queuing can * significantly improve USB throughput. * * There are three kinds of completion for this function. * * (1) success, where io->status is zero. The number of io->bytes * transferred is as requested. * (2) error, where io->status is a negative errno value. The number * of io->bytes transferred before the error is usually less * than requested, and can be nonzero. * (3) cancellation, a type of error with status -ECONNRESET that * is initiated by usb_sg_cancel(). * * When this function returns, all memory allocated through usb_sg_init() or * this call will have been freed. 
The request block parameter may still be * passed to usb_sg_cancel(), or it may be freed. It could also be * reinitialized and then reused. * * Data Transfer Rates: * * Bulk transfers are valid for full or high speed endpoints. * The best full speed data rate is 19 packets of 64 bytes each * per frame, or 1216 bytes per millisecond. * The best high speed data rate is 13 packets of 512 bytes each * per microframe, or 52 KBytes per millisecond. * * The reason to use interrupt transfers through this API would most likely * be to reserve high speed bandwidth, where up to 24 KBytes per millisecond * could be transferred. That capability is less useful for low or full * speed interrupt endpoints, which allow at most one packet per millisecond, * of at most 8 or 64 bytes (respectively). * * It is not necessary to call this function to reserve bandwidth for devices * under an xHCI host controller, as the bandwidth is reserved when the * configuration or interface alt setting is selected. */ void usb_sg_wait(struct usb_sg_request *io) { int i; int entries = io->entries; /* queue the urbs. */ spin_lock_irq(&io->lock); i = 0; while (i < entries && !io->status) { int retval; io->urbs[i]->dev = io->dev; spin_unlock_irq(&io->lock); retval = usb_submit_urb(io->urbs[i], GFP_NOIO); switch (retval) { /* maybe we retrying will recover */ case -ENXIO: /* hc didn't queue this one */ case -EAGAIN: case -ENOMEM: retval = 0; yield(); break; /* no error? continue immediately. * * NOTE: to work better with UHCI (4K I/O buffer may * need 3K of TDs) it may be good to limit how many * URBs are queued at once; N milliseconds? */ case 0: ++i; cpu_relax(); break; /* fail any uncompleted urbs */ default: io->urbs[i]->status = retval; dev_dbg(&io->dev->dev, "%s, submit --> %d\n", __func__, retval); usb_sg_cancel(io); } spin_lock_irq(&io->lock); if (retval && (io->status == 0 || io->status == -ECONNRESET)) io->status = retval; } io->count -= entries - i; if (io->count == 0) complete(&io->complete); spin_unlock_irq(&io->lock); /* OK, yes, this could be packaged as non-blocking. * So could the submit loop above ... but it's easier to * solve neither problem than to solve both! */ wait_for_completion(&io->complete); sg_clean(io); } EXPORT_SYMBOL_GPL(usb_sg_wait); /** * usb_sg_cancel - stop scatter/gather i/o issued by usb_sg_wait() * @io: request block, initialized with usb_sg_init() * * This stops a request after it has been started by usb_sg_wait(). * It can also prevents one initialized by usb_sg_init() from starting, * so that call just frees resources allocated to the request. 
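 *
 * For reference, a sketch of the blocking sequence that usb_sg_cancel() may
 * interrupt from another thread ("udev", "sgl" and "nents" are placeholders,
 * not taken from this file):
 *
 *	struct usb_sg_request io;
 *
 *	if (usb_sg_init(&io, udev, usb_rcvbulkpipe(udev, 2), 0,
 *			sgl, nents, 0, GFP_KERNEL) == 0) {
 *		usb_sg_wait(&io);	/* blocks; see io.status and io.bytes */
 *		if (io.status)
 *			dev_dbg(&udev->dev, "sg status %d\n", io.status);
 *	}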
*/ void usb_sg_cancel(struct usb_sg_request *io) { unsigned long flags; int i, retval; spin_lock_irqsave(&io->lock, flags); if (io->status || io->count == 0) { spin_unlock_irqrestore(&io->lock, flags); return; } /* shut everything down */ io->status = -ECONNRESET; io->count++; /* Keep the request alive until we're done */ spin_unlock_irqrestore(&io->lock, flags); for (i = io->entries - 1; i >= 0; --i) { usb_block_urb(io->urbs[i]); retval = usb_unlink_urb(io->urbs[i]); if (retval != -EINPROGRESS && retval != -ENODEV && retval != -EBUSY && retval != -EIDRM) dev_warn(&io->dev->dev, "%s, unlink --> %d\n", __func__, retval); } spin_lock_irqsave(&io->lock, flags); io->count--; if (!io->count) complete(&io->complete); spin_unlock_irqrestore(&io->lock, flags); } EXPORT_SYMBOL_GPL(usb_sg_cancel); /*-------------------------------------------------------------------*/ /** * usb_get_descriptor - issues a generic GET_DESCRIPTOR request * @dev: the device whose descriptor is being retrieved * @type: the descriptor type (USB_DT_*) * @index: the number of the descriptor * @buf: where to put the descriptor * @size: how big is "buf"? * * Context: task context, might sleep. * * Gets a USB descriptor. Convenience functions exist to simplify * getting some types of descriptors. Use * usb_get_string() or usb_string() for USB_DT_STRING. * Device (USB_DT_DEVICE) and configuration descriptors (USB_DT_CONFIG) * are part of the device structure. * In addition to a number of USB-standard descriptors, some * devices also use class-specific or vendor-specific descriptors. * * This call is synchronous, and may not be used in an interrupt context. * * Return: The number of bytes received on success, or else the status code * returned by the underlying usb_control_msg() call. */ int usb_get_descriptor(struct usb_device *dev, unsigned char type, unsigned char index, void *buf, int size) { int i; int result; if (size <= 0) /* No point in asking for no data */ return -EINVAL; memset(buf, 0, size); /* Make sure we parse really received data */ for (i = 0; i < 3; ++i) { /* retry on length 0 or error; some devices are flakey */ result = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, (type << 8) + index, 0, buf, size, USB_CTRL_GET_TIMEOUT); if (result <= 0 && result != -ETIMEDOUT) continue; if (result > 1 && ((u8 *)buf)[1] != type) { result = -ENODATA; continue; } break; } return result; } EXPORT_SYMBOL_GPL(usb_get_descriptor); /** * usb_get_string - gets a string descriptor * @dev: the device whose string descriptor is being retrieved * @langid: code for language chosen (from string descriptor zero) * @index: the number of the descriptor * @buf: where to put the string * @size: how big is "buf"? * * Context: task context, might sleep. * * Retrieves a string, encoded using UTF-16LE (Unicode, 16 bits per character, * in little-endian byte order). * The usb_string() function will often be a convenient way to turn * these strings into kernel-printable form. * * Strings may be referenced in device, configuration, interface, or other * descriptors, and could also be used in vendor-specific ways. * * This call is synchronous, and may not be used in an interrupt context. * * Return: The number of bytes received on success, or else the status code * returned by the underlying usb_control_msg() call. 
*/ static int usb_get_string(struct usb_device *dev, unsigned short langid, unsigned char index, void *buf, int size) { int i; int result; if (size <= 0) /* No point in asking for no data */ return -EINVAL; for (i = 0; i < 3; ++i) { /* retry on length 0 or stall; some devices are flakey */ result = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, (USB_DT_STRING << 8) + index, langid, buf, size, USB_CTRL_GET_TIMEOUT); if (result == 0 || result == -EPIPE) continue; if (result > 1 && ((u8 *) buf)[1] != USB_DT_STRING) { result = -ENODATA; continue; } break; } return result; } static void usb_try_string_workarounds(unsigned char *buf, int *length) { int newlength, oldlength = *length; for (newlength = 2; newlength + 1 < oldlength; newlength += 2) if (!isprint(buf[newlength]) || buf[newlength + 1]) break; if (newlength > 2) { buf[0] = newlength; *length = newlength; } } static int usb_string_sub(struct usb_device *dev, unsigned int langid, unsigned int index, unsigned char *buf) { int rc; /* Try to read the string descriptor by asking for the maximum * possible number of bytes */ if (dev->quirks & USB_QUIRK_STRING_FETCH_255) rc = -EIO; else rc = usb_get_string(dev, langid, index, buf, 255); /* If that failed try to read the descriptor length, then * ask for just that many bytes */ if (rc < 2) { rc = usb_get_string(dev, langid, index, buf, 2); if (rc == 2) rc = usb_get_string(dev, langid, index, buf, buf[0]); } if (rc >= 2) { if (!buf[0] && !buf[1]) usb_try_string_workarounds(buf, &rc); /* There might be extra junk at the end of the descriptor */ if (buf[0] < rc) rc = buf[0]; rc = rc - (rc & 1); /* force a multiple of two */ } if (rc < 2) rc = (rc < 0 ? rc : -EINVAL); return rc; } static int usb_get_langid(struct usb_device *dev, unsigned char *tbuf) { int err; if (dev->have_langid) return 0; if (dev->string_langid < 0) return -EPIPE; err = usb_string_sub(dev, 0, 0, tbuf); /* If the string was reported but is malformed, default to english * (0x0409) */ if (err == -ENODATA || (err > 0 && err < 4)) { dev->string_langid = 0x0409; dev->have_langid = 1; dev_err(&dev->dev, "language id specifier not provided by device, defaulting to English\n"); return 0; } /* In case of all other errors, we assume the device is not able to * deal with strings at all. Set string_langid to -1 in order to * prevent any string to be retrieved from the device */ if (err < 0) { dev_info(&dev->dev, "string descriptor 0 read error: %d\n", err); dev->string_langid = -1; return -EPIPE; } /* always use the first langid listed */ dev->string_langid = tbuf[2] | (tbuf[3] << 8); dev->have_langid = 1; dev_dbg(&dev->dev, "default language 0x%04x\n", dev->string_langid); return 0; } /** * usb_string - returns UTF-8 version of a string descriptor * @dev: the device whose string descriptor is being retrieved * @index: the number of the descriptor * @buf: where to put the string * @size: how big is "buf"? * * Context: task context, might sleep. * * This converts the UTF-16LE encoded strings returned by devices, from * usb_get_string_descriptor(), to null-terminated UTF-8 encoded ones * that are more usable in most kernel contexts. Note that this function * chooses strings in the first language supported by the device. * * This call is synchronous, and may not be used in an interrupt context. * * Return: length of the string (>= 0) or usb_control_msg status (< 0). 
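 *
 * A brief usage sketch (illustrative; the product-string index comes from the
 * device descriptor and "buf" is a caller-provided array):
 *
 *	char buf[64];
 *
 *	if (usb_string(udev, udev->descriptor.iProduct, buf, sizeof(buf)) > 0)
 *		dev_info(&udev->dev, "product: %s\n", buf);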
*/ int usb_string(struct usb_device *dev, int index, char *buf, size_t size) { unsigned char *tbuf; int err; if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (size <= 0 || !buf) return -EINVAL; buf[0] = 0; if (index <= 0 || index >= 256) return -EINVAL; tbuf = kmalloc(256, GFP_NOIO); if (!tbuf) return -ENOMEM; err = usb_get_langid(dev, tbuf); if (err < 0) goto errout; err = usb_string_sub(dev, dev->string_langid, index, tbuf); if (err < 0) goto errout; size--; /* leave room for trailing NULL char in output buffer */ err = utf16s_to_utf8s((wchar_t *) &tbuf[2], (err - 2) / 2, UTF16_LITTLE_ENDIAN, buf, size); buf[err] = 0; if (tbuf[1] != USB_DT_STRING) dev_dbg(&dev->dev, "wrong descriptor type %02x for string %d (\"%s\")\n", tbuf[1], index, buf); errout: kfree(tbuf); return err; } EXPORT_SYMBOL_GPL(usb_string); /* one UTF-8-encoded 16-bit character has at most three bytes */ #define MAX_USB_STRING_SIZE (127 * 3 + 1) /** * usb_cache_string - read a string descriptor and cache it for later use * @udev: the device whose string descriptor is being read * @index: the descriptor index * * Return: A pointer to a kmalloc'ed buffer containing the descriptor string, * or %NULL if the index is 0 or the string could not be read. */ char *usb_cache_string(struct usb_device *udev, int index) { char *buf; char *smallbuf = NULL; int len; if (index <= 0) return NULL; buf = kmalloc(MAX_USB_STRING_SIZE, GFP_NOIO); if (buf) { len = usb_string(udev, index, buf, MAX_USB_STRING_SIZE); if (len > 0) { smallbuf = kmalloc(++len, GFP_NOIO); if (!smallbuf) return buf; memcpy(smallbuf, buf, len); } kfree(buf); } return smallbuf; } EXPORT_SYMBOL_GPL(usb_cache_string); /* * usb_get_device_descriptor - read the device descriptor * @udev: the device whose device descriptor should be read * * Context: task context, might sleep. * * Not exported, only for use by the core. If drivers really want to read * the device descriptor directly, they can call usb_get_descriptor() with * type = USB_DT_DEVICE and index = 0. * * Returns: a pointer to a dynamically allocated usb_device_descriptor * structure (which the caller must deallocate), or an ERR_PTR value. */ struct usb_device_descriptor *usb_get_device_descriptor(struct usb_device *udev) { struct usb_device_descriptor *desc; int ret; desc = kmalloc(sizeof(*desc), GFP_NOIO); if (!desc) return ERR_PTR(-ENOMEM); ret = usb_get_descriptor(udev, USB_DT_DEVICE, 0, desc, sizeof(*desc)); if (ret == sizeof(*desc)) return desc; if (ret >= 0) ret = -EMSGSIZE; kfree(desc); return ERR_PTR(ret); } /* * usb_set_isoch_delay - informs the device of the packet transmit delay * @dev: the device whose delay is to be informed * Context: task context, might sleep * * Since this is an optional request, we don't bother if it fails. */ int usb_set_isoch_delay(struct usb_device *dev) { /* skip hub devices */ if (dev->descriptor.bDeviceClass == USB_CLASS_HUB) return 0; /* skip non-SS/non-SSP devices */ if (dev->speed < USB_SPEED_SUPER) return 0; return usb_control_msg_send(dev, 0, USB_REQ_SET_ISOCH_DELAY, USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, dev->hub_delay, 0, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); } /** * usb_get_status - issues a GET_STATUS call * @dev: the device whose status is being checked * @recip: USB_RECIP_*; for device, interface, or endpoint * @type: USB_STATUS_TYPE_*; for standard or PTM status types * @target: zero (for device), else interface or endpoint number * @data: pointer to two bytes of bitmap data * * Context: task context, might sleep. 
* * Returns device, interface, or endpoint status. Normally only of * interest to see if the device is self powered, or has enabled the * remote wakeup facility; or whether a bulk or interrupt endpoint * is halted ("stalled"). * * Bits in these status bitmaps are set using the SET_FEATURE request, * and cleared using the CLEAR_FEATURE request. The usb_clear_halt() * function should be used to clear halt ("stall") status. * * This call is synchronous, and may not be used in an interrupt context. * * Returns 0 and the status value in *@data (in host byte order) on success, * or else the status code from the underlying usb_control_msg() call. */ int usb_get_status(struct usb_device *dev, int recip, int type, int target, void *data) { int ret; void *status; int length; switch (type) { case USB_STATUS_TYPE_STANDARD: length = 2; break; case USB_STATUS_TYPE_PTM: if (recip != USB_RECIP_DEVICE) return -EINVAL; length = 4; break; default: return -EINVAL; } status = kmalloc(length, GFP_KERNEL); if (!status) return -ENOMEM; ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), USB_REQ_GET_STATUS, USB_DIR_IN | recip, USB_STATUS_TYPE_STANDARD, target, status, length, USB_CTRL_GET_TIMEOUT); switch (ret) { case 4: if (type != USB_STATUS_TYPE_PTM) { ret = -EIO; break; } *(u32 *) data = le32_to_cpu(*(__le32 *) status); ret = 0; break; case 2: if (type != USB_STATUS_TYPE_STANDARD) { ret = -EIO; break; } *(u16 *) data = le16_to_cpu(*(__le16 *) status); ret = 0; break; default: ret = -EIO; } kfree(status); return ret; } EXPORT_SYMBOL_GPL(usb_get_status); /** * usb_clear_halt - tells device to clear endpoint halt/stall condition * @dev: device whose endpoint is halted * @pipe: endpoint "pipe" being cleared * * Context: task context, might sleep. * * This is used to clear halt conditions for bulk and interrupt endpoints, * as reported by URB completion status. Endpoints that are halted are * sometimes referred to as being "stalled". Such endpoints are unable * to transmit or receive data until the halt status is cleared. Any URBs * queued for such an endpoint should normally be unlinked by the driver * before clearing the halt condition, as described in sections 5.7.5 * and 5.8.5 of the USB 2.0 spec. * * Note that control and isochronous endpoints don't halt, although control * endpoints report "protocol stall" (for unsupported requests) using the * same status code used to report a true stall. * * This call is synchronous, and may not be used in an interrupt context. * * Return: Zero on success, or else the status code returned by the * underlying usb_control_msg() call. */ int usb_clear_halt(struct usb_device *dev, int pipe) { int result; int endp = usb_pipeendpoint(pipe); if (usb_pipein(pipe)) endp |= USB_DIR_IN; /* we don't care if it wasn't halted first. in fact some devices * (like some ibmcam model 1 units) seem to expect hosts to make * this request for iso endpoints, which can't halt! */ result = usb_control_msg_send(dev, 0, USB_REQ_CLEAR_FEATURE, USB_RECIP_ENDPOINT, USB_ENDPOINT_HALT, endp, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); /* don't un-halt or force to DATA0 except on success */ if (result) return result; /* NOTE: seems like Microsoft and Apple don't bother verifying * the clear "took", so some devices could lock up if you check... * such as the Hagiwara FlashGate DUAL. So we won't bother. * * NOTE: make sure the logic here doesn't diverge much from * the copy in usb-storage, for as long as we need two copies. 
*/ usb_reset_endpoint(dev, endp); return 0; } EXPORT_SYMBOL_GPL(usb_clear_halt); static int create_intf_ep_devs(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct usb_host_interface *alt = intf->cur_altsetting; int i; if (intf->ep_devs_created || intf->unregistering) return 0; for (i = 0; i < alt->desc.bNumEndpoints; ++i) (void) usb_create_ep_devs(&intf->dev, &alt->endpoint[i], udev); intf->ep_devs_created = 1; return 0; } static void remove_intf_ep_devs(struct usb_interface *intf) { struct usb_host_interface *alt = intf->cur_altsetting; int i; if (!intf->ep_devs_created) return; for (i = 0; i < alt->desc.bNumEndpoints; ++i) usb_remove_ep_devs(&alt->endpoint[i]); intf->ep_devs_created = 0; } /** * usb_disable_endpoint -- Disable an endpoint by address * @dev: the device whose endpoint is being disabled * @epaddr: the endpoint's address. Endpoint number for output, * endpoint number + USB_DIR_IN for input * @reset_hardware: flag to erase any endpoint state stored in the * controller hardware * * Disables the endpoint for URB submission and nukes all pending URBs. * If @reset_hardware is set then also deallocates hcd/hardware state * for the endpoint. */ void usb_disable_endpoint(struct usb_device *dev, unsigned int epaddr, bool reset_hardware) { unsigned int epnum = epaddr & USB_ENDPOINT_NUMBER_MASK; struct usb_host_endpoint *ep; if (!dev) return; if (usb_endpoint_out(epaddr)) { ep = dev->ep_out[epnum]; if (reset_hardware && epnum != 0) dev->ep_out[epnum] = NULL; } else { ep = dev->ep_in[epnum]; if (reset_hardware && epnum != 0) dev->ep_in[epnum] = NULL; } if (ep) { ep->enabled = 0; usb_hcd_flush_endpoint(dev, ep); if (reset_hardware) usb_hcd_disable_endpoint(dev, ep); } } /** * usb_reset_endpoint - Reset an endpoint's state. * @dev: the device whose endpoint is to be reset * @epaddr: the endpoint's address. Endpoint number for output, * endpoint number + USB_DIR_IN for input * * Resets any host-side endpoint state such as the toggle bit, * sequence number or current window. */ void usb_reset_endpoint(struct usb_device *dev, unsigned int epaddr) { unsigned int epnum = epaddr & USB_ENDPOINT_NUMBER_MASK; struct usb_host_endpoint *ep; if (usb_endpoint_out(epaddr)) ep = dev->ep_out[epnum]; else ep = dev->ep_in[epnum]; if (ep) usb_hcd_reset_endpoint(dev, ep); } EXPORT_SYMBOL_GPL(usb_reset_endpoint); /** * usb_disable_interface -- Disable all endpoints for an interface * @dev: the device whose interface is being disabled * @intf: pointer to the interface descriptor * @reset_hardware: flag to erase any endpoint state stored in the * controller hardware * * Disables all the endpoints for the interface's current altsetting. */ void usb_disable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_hardware) { struct usb_host_interface *alt = intf->cur_altsetting; int i; for (i = 0; i < alt->desc.bNumEndpoints; ++i) { usb_disable_endpoint(dev, alt->endpoint[i].desc.bEndpointAddress, reset_hardware); } } /* * usb_disable_device_endpoints -- Disable all endpoints for a device * @dev: the device whose endpoints are being disabled * @skip_ep0: 0 to disable endpoint 0, 1 to skip it. */ static void usb_disable_device_endpoints(struct usb_device *dev, int skip_ep0) { struct usb_hcd *hcd = bus_to_hcd(dev->bus); int i; if (hcd->driver->check_bandwidth) { /* First pass: Cancel URBs, leave endpoint pointers intact. 
*/ for (i = skip_ep0; i < 16; ++i) { usb_disable_endpoint(dev, i, false); usb_disable_endpoint(dev, i + USB_DIR_IN, false); } /* Remove endpoints from the host controller internal state */ mutex_lock(hcd->bandwidth_mutex); usb_hcd_alloc_bandwidth(dev, NULL, NULL, NULL); mutex_unlock(hcd->bandwidth_mutex); } /* Second pass: remove endpoint pointers */ for (i = skip_ep0; i < 16; ++i) { usb_disable_endpoint(dev, i, true); usb_disable_endpoint(dev, i + USB_DIR_IN, true); } } /** * usb_disable_device - Disable all the endpoints for a USB device * @dev: the device whose endpoints are being disabled * @skip_ep0: 0 to disable endpoint 0, 1 to skip it. * * Disables all the device's endpoints, potentially including endpoint 0. * Deallocates hcd/hardware state for the endpoints (nuking all or most * pending urbs) and usbcore state for the interfaces, so that usbcore * must usb_set_configuration() before any interfaces could be used. */ void usb_disable_device(struct usb_device *dev, int skip_ep0) { int i; /* getting rid of interfaces will disconnect * any drivers bound to them (a key side effect) */ if (dev->actconfig) { /* * FIXME: In order to avoid self-deadlock involving the * bandwidth_mutex, we have to mark all the interfaces * before unregistering any of them. */ for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) dev->actconfig->interface[i]->unregistering = 1; for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) { struct usb_interface *interface; /* remove this interface if it has been registered */ interface = dev->actconfig->interface[i]; if (!device_is_registered(&interface->dev)) continue; dev_dbg(&dev->dev, "unregistering interface %s\n", dev_name(&interface->dev)); remove_intf_ep_devs(interface); device_del(&interface->dev); } /* Now that the interfaces are unbound, nobody should * try to access them. */ for (i = 0; i < dev->actconfig->desc.bNumInterfaces; i++) { put_device(&dev->actconfig->interface[i]->dev); dev->actconfig->interface[i] = NULL; } usb_disable_usb2_hardware_lpm(dev); usb_unlocked_disable_lpm(dev); usb_disable_ltm(dev); dev->actconfig = NULL; if (dev->state == USB_STATE_CONFIGURED) usb_set_device_state(dev, USB_STATE_ADDRESS); } dev_dbg(&dev->dev, "%s nuking %s URBs\n", __func__, skip_ep0 ? "non-ep0" : "all"); usb_disable_device_endpoints(dev, skip_ep0); } /** * usb_enable_endpoint - Enable an endpoint for USB communications * @dev: the device whose interface is being enabled * @ep: the endpoint * @reset_ep: flag to reset the endpoint state * * Resets the endpoint state if asked, and sets dev->ep_{in,out} pointers. * For control endpoints, both the input and output sides are handled. */ void usb_enable_endpoint(struct usb_device *dev, struct usb_host_endpoint *ep, bool reset_ep) { int epnum = usb_endpoint_num(&ep->desc); int is_out = usb_endpoint_dir_out(&ep->desc); int is_control = usb_endpoint_xfer_control(&ep->desc); if (reset_ep) usb_hcd_reset_endpoint(dev, ep); if (is_out || is_control) dev->ep_out[epnum] = ep; if (!is_out || is_control) dev->ep_in[epnum] = ep; ep->enabled = 1; } /** * usb_enable_interface - Enable all the endpoints for an interface * @dev: the device whose interface is being enabled * @intf: pointer to the interface descriptor * @reset_eps: flag to reset the endpoints' state * * Enables all the endpoints for the interface's current altsetting. 
*/ void usb_enable_interface(struct usb_device *dev, struct usb_interface *intf, bool reset_eps) { struct usb_host_interface *alt = intf->cur_altsetting; int i; for (i = 0; i < alt->desc.bNumEndpoints; ++i) usb_enable_endpoint(dev, &alt->endpoint[i], reset_eps); } /** * usb_set_interface - Makes a particular alternate setting be current * @dev: the device whose interface is being updated * @interface: the interface being updated * @alternate: the setting being chosen. * * Context: task context, might sleep. * * This is used to enable data transfers on interfaces that may not * be enabled by default. Not all devices support such configurability. * Only the driver bound to an interface may change its setting. * * Within any given configuration, each interface may have several * alternative settings. These are often used to control levels of * bandwidth consumption. For example, the default setting for a high * speed interrupt endpoint may not send more than 64 bytes per microframe, * while interrupt transfers of up to 3KBytes per microframe are legal. * Also, isochronous endpoints may never be part of an * interface's default setting. To access such bandwidth, alternate * interface settings must be made current. * * Note that in the Linux USB subsystem, bandwidth associated with * an endpoint in a given alternate setting is not reserved until an URB * is submitted that needs that bandwidth. Some other operating systems * allocate bandwidth early, when a configuration is chosen. * * xHCI reserves bandwidth and configures the alternate setting in * usb_hcd_alloc_bandwidth(). If it fails the original interface altsetting * may be disabled. Drivers cannot rely on any particular alternate * setting being in effect after a failure. * * This call is synchronous, and may not be used in an interrupt context. * Also, drivers must not change altsettings while urbs are scheduled for * endpoints in that interface; all such urbs must first be completed * (perhaps forced by unlinking). * * Return: Zero on success, or else the status code returned by the * underlying usb_control_msg() call. */ int usb_set_interface(struct usb_device *dev, int interface, int alternate) { struct usb_interface *iface; struct usb_host_interface *alt; struct usb_hcd *hcd = bus_to_hcd(dev->bus); int i, ret, manual = 0; unsigned int epaddr; unsigned int pipe; if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; iface = usb_ifnum_to_if(dev, interface); if (!iface) { dev_dbg(&dev->dev, "selecting invalid interface %d\n", interface); return -EINVAL; } if (iface->unregistering) return -ENODEV; alt = usb_altnum_to_altsetting(iface, alternate); if (!alt) { dev_warn(&dev->dev, "selecting invalid altsetting %d\n", alternate); return -EINVAL; } /* * usb3 hosts configure the interface in usb_hcd_alloc_bandwidth, * including freeing dropped endpoint ring buffers. * Make sure the interface endpoints are flushed before that */ usb_disable_interface(dev, iface, false); /* Make sure we have enough bandwidth for this alternate interface. * Remove the current alt setting and add the new alt setting. */ mutex_lock(hcd->bandwidth_mutex); /* Disable LPM, and re-enable it once the new alt setting is installed, * so that the xHCI driver can recalculate the U1/U2 timeouts. 
*/ if (usb_disable_lpm(dev)) { dev_err(&iface->dev, "%s Failed to disable LPM\n", __func__); mutex_unlock(hcd->bandwidth_mutex); return -ENOMEM; } /* Changing alt-setting also frees any allocated streams */ for (i = 0; i < iface->cur_altsetting->desc.bNumEndpoints; i++) iface->cur_altsetting->endpoint[i].streams = 0; ret = usb_hcd_alloc_bandwidth(dev, NULL, iface->cur_altsetting, alt); if (ret < 0) { dev_info(&dev->dev, "Not enough bandwidth for altsetting %d\n", alternate); usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return ret; } if (dev->quirks & USB_QUIRK_NO_SET_INTF) ret = -EPIPE; else ret = usb_control_msg_send(dev, 0, USB_REQ_SET_INTERFACE, USB_RECIP_INTERFACE, alternate, interface, NULL, 0, 5000, GFP_NOIO); /* 9.4.10 says devices don't need this and are free to STALL the * request if the interface only has one alternate setting. */ if (ret == -EPIPE && iface->num_altsetting == 1) { dev_dbg(&dev->dev, "manual set_interface for iface %d, alt %d\n", interface, alternate); manual = 1; } else if (ret) { /* Re-instate the old alt setting */ usb_hcd_alloc_bandwidth(dev, NULL, alt, iface->cur_altsetting); usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return ret; } mutex_unlock(hcd->bandwidth_mutex); /* FIXME drivers shouldn't need to replicate/bugfix the logic here * when they implement async or easily-killable versions of this or * other "should-be-internal" functions (like clear_halt). * should hcd+usbcore postprocess control requests? */ /* prevent submissions using previous endpoint settings */ if (iface->cur_altsetting != alt) { remove_intf_ep_devs(iface); usb_remove_sysfs_intf_files(iface); } usb_disable_interface(dev, iface, true); iface->cur_altsetting = alt; /* Now that the interface is installed, re-enable LPM. */ usb_unlocked_enable_lpm(dev); /* If the interface only has one altsetting and the device didn't * accept the request, we attempt to carry out the equivalent action * by manually clearing the HALT feature for each endpoint in the * new altsetting. */ if (manual) { for (i = 0; i < alt->desc.bNumEndpoints; i++) { epaddr = alt->endpoint[i].desc.bEndpointAddress; pipe = __create_pipe(dev, USB_ENDPOINT_NUMBER_MASK & epaddr) | (usb_endpoint_out(epaddr) ? USB_DIR_OUT : USB_DIR_IN); usb_clear_halt(dev, pipe); } } /* 9.1.1.5: reset toggles for all endpoints in the new altsetting * * Note: * Despite EP0 is always present in all interfaces/AS, the list of * endpoints from the descriptor does not contain EP0. Due to its * omnipresence one might expect EP0 being considered "affected" by * any SetInterface request and hence assume toggles need to be reset. * However, EP0 toggles are re-synced for every individual transfer * during the SETUP stage - hence EP0 toggles are "don't care" here. * (Likewise, EP0 never "halts" on well designed devices.) */ usb_enable_interface(dev, iface, true); if (device_is_registered(&iface->dev)) { usb_create_sysfs_intf_files(iface); create_intf_ep_devs(iface); } return 0; } EXPORT_SYMBOL_GPL(usb_set_interface); /** * usb_reset_configuration - lightweight device reset * @dev: the device whose configuration is being reset * * This issues a standard SET_CONFIGURATION request to the device using * the current configuration. The effect is to reset most USB-related * state in the device, including interface altsettings (reset to zero), * endpoint halts (cleared), and endpoint state (only for bulk and interrupt * endpoints). Other usbcore state is unchanged, including bindings of * usb device drivers to interfaces. 
* * Because this affects multiple interfaces, avoid using this with composite * (multi-interface) devices. Instead, the driver for each interface may * use usb_set_interface() on the interfaces it claims. Be careful though; * some devices don't support the SET_INTERFACE request, and others won't * reset all the interface state (notably endpoint state). Resetting the whole * configuration would affect other drivers' interfaces. * * The caller must own the device lock. * * Return: Zero on success, else a negative error code. * * If this routine fails the device will probably be in an unusable state * with endpoints disabled, and interfaces only partially enabled. */ int usb_reset_configuration(struct usb_device *dev) { int i, retval; struct usb_host_config *config; struct usb_hcd *hcd = bus_to_hcd(dev->bus); if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; /* caller must have locked the device and must own * the usb bus readlock (so driver bindings are stable); * calls during probe() are fine */ usb_disable_device_endpoints(dev, 1); /* skip ep0*/ config = dev->actconfig; retval = 0; mutex_lock(hcd->bandwidth_mutex); /* Disable LPM, and re-enable it once the configuration is reset, so * that the xHCI driver can recalculate the U1/U2 timeouts. */ if (usb_disable_lpm(dev)) { dev_err(&dev->dev, "%s Failed to disable LPM\n", __func__); mutex_unlock(hcd->bandwidth_mutex); return -ENOMEM; } /* xHCI adds all endpoints in usb_hcd_alloc_bandwidth */ retval = usb_hcd_alloc_bandwidth(dev, config, NULL, NULL); if (retval < 0) { usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return retval; } retval = usb_control_msg_send(dev, 0, USB_REQ_SET_CONFIGURATION, 0, config->desc.bConfigurationValue, 0, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); if (retval) { usb_hcd_alloc_bandwidth(dev, NULL, NULL, NULL); usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); return retval; } mutex_unlock(hcd->bandwidth_mutex); /* re-init hc/hcd interface/endpoint state */ for (i = 0; i < config->desc.bNumInterfaces; i++) { struct usb_interface *intf = config->interface[i]; struct usb_host_interface *alt; alt = usb_altnum_to_altsetting(intf, 0); /* No altsetting 0? We'll assume the first altsetting. * We could use a GetInterface call, but if a device is * so non-compliant that it doesn't have altsetting 0 * then I wouldn't trust its reply anyway. */ if (!alt) alt = &intf->altsetting[0]; if (alt != intf->cur_altsetting) { remove_intf_ep_devs(intf); usb_remove_sysfs_intf_files(intf); } intf->cur_altsetting = alt; usb_enable_interface(dev, intf, true); if (device_is_registered(&intf->dev)) { usb_create_sysfs_intf_files(intf); create_intf_ep_devs(intf); } } /* Now that the interfaces are installed, re-enable LPM. 
*/ usb_unlocked_enable_lpm(dev); return 0; } EXPORT_SYMBOL_GPL(usb_reset_configuration); static void usb_release_interface(struct device *dev) { struct usb_interface *intf = to_usb_interface(dev); struct usb_interface_cache *intfc = altsetting_to_usb_interface_cache(intf->altsetting); kref_put(&intfc->ref, usb_release_interface_cache); usb_put_dev(interface_to_usbdev(intf)); of_node_put(dev->of_node); kfree(intf); } /* * usb_deauthorize_interface - deauthorize an USB interface * * @intf: USB interface structure */ void usb_deauthorize_interface(struct usb_interface *intf) { struct device *dev = &intf->dev; device_lock(dev->parent); if (intf->authorized) { device_lock(dev); intf->authorized = 0; device_unlock(dev); usb_forced_unbind_intf(intf); } device_unlock(dev->parent); } /* * usb_authorize_interface - authorize an USB interface * * @intf: USB interface structure */ void usb_authorize_interface(struct usb_interface *intf) { struct device *dev = &intf->dev; if (!intf->authorized) { device_lock(dev); intf->authorized = 1; /* authorize interface */ device_unlock(dev); } } static int usb_if_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_device *usb_dev; const struct usb_interface *intf; const struct usb_host_interface *alt; intf = to_usb_interface(dev); usb_dev = interface_to_usbdev(intf); alt = intf->cur_altsetting; if (add_uevent_var(env, "INTERFACE=%d/%d/%d", alt->desc.bInterfaceClass, alt->desc.bInterfaceSubClass, alt->desc.bInterfaceProtocol)) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=usb:" "v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02Xic%02Xisc%02Xip%02Xin%02X", le16_to_cpu(usb_dev->descriptor.idVendor), le16_to_cpu(usb_dev->descriptor.idProduct), le16_to_cpu(usb_dev->descriptor.bcdDevice), usb_dev->descriptor.bDeviceClass, usb_dev->descriptor.bDeviceSubClass, usb_dev->descriptor.bDeviceProtocol, alt->desc.bInterfaceClass, alt->desc.bInterfaceSubClass, alt->desc.bInterfaceProtocol, alt->desc.bInterfaceNumber)) return -ENOMEM; return 0; } struct device_type usb_if_device_type = { .name = "usb_interface", .release = usb_release_interface, .uevent = usb_if_uevent, }; static struct usb_interface_assoc_descriptor *find_iad(struct usb_device *dev, struct usb_host_config *config, u8 inum) { struct usb_interface_assoc_descriptor *retval = NULL; struct usb_interface_assoc_descriptor *intf_assoc; int first_intf; int last_intf; int i; for (i = 0; (i < USB_MAXIADS && config->intf_assoc[i]); i++) { intf_assoc = config->intf_assoc[i]; if (intf_assoc->bInterfaceCount == 0) continue; first_intf = intf_assoc->bFirstInterface; last_intf = first_intf + (intf_assoc->bInterfaceCount - 1); if (inum >= first_intf && inum <= last_intf) { if (!retval) retval = intf_assoc; else dev_err(&dev->dev, "Interface #%d referenced" " by multiple IADs\n", inum); } } return retval; } /* * Internal function to queue a device reset * See usb_queue_reset_device() for more details */ static void __usb_queue_reset_device(struct work_struct *ws) { int rc; struct usb_interface *iface = container_of(ws, struct usb_interface, reset_ws); struct usb_device *udev = interface_to_usbdev(iface); rc = usb_lock_device_for_reset(udev, iface); if (rc >= 0) { usb_reset_device(udev); usb_unlock_device(udev); } usb_put_intf(iface); /* Undo _get_ in usb_queue_reset_device() */ } /* * Internal function to set the wireless_status sysfs attribute * See usb_set_wireless_status() for more details */ static void __usb_wireless_status_intf(struct work_struct *ws) { struct usb_interface *iface = container_of(ws, struct 
usb_interface, wireless_status_work); device_lock(iface->dev.parent); if (iface->sysfs_files_created) usb_update_wireless_status_attr(iface); device_unlock(iface->dev.parent); usb_put_intf(iface); /* Undo _get_ in usb_set_wireless_status() */ } /** * usb_set_wireless_status - sets the wireless_status struct member * @iface: the interface to modify * @status: the new wireless status * * Set the wireless_status struct member to the new value, and emit * sysfs changes as necessary. * * Returns: 0 on success, -EALREADY if already set. */ int usb_set_wireless_status(struct usb_interface *iface, enum usb_wireless_status status) { if (iface->wireless_status == status) return -EALREADY; usb_get_intf(iface); iface->wireless_status = status; schedule_work(&iface->wireless_status_work); return 0; } EXPORT_SYMBOL_GPL(usb_set_wireless_status); /* * usb_set_configuration - Makes a particular device setting be current * @dev: the device whose configuration is being updated * @configuration: the configuration being chosen. * * Context: task context, might sleep. Caller holds device lock. * * This is used to enable non-default device modes. Not all devices * use this kind of configurability; many devices only have one * configuration. * * @configuration is the value of the configuration to be installed. * According to the USB spec (e.g. section 9.1.1.5), configuration values * must be non-zero; a value of zero indicates that the device in * unconfigured. However some devices erroneously use 0 as one of their * configuration values. To help manage such devices, this routine will * accept @configuration = -1 as indicating the device should be put in * an unconfigured state. * * USB device configurations may affect Linux interoperability, * power consumption and the functionality available. For example, * the default configuration is limited to using 100mA of bus power, * so that when certain device functionality requires more power, * and the device is bus powered, that functionality should be in some * non-default device configuration. Other device modes may also be * reflected as configuration options, such as whether two ISDN * channels are available independently; and choosing between open * standard device protocols (like CDC) or proprietary ones. * * Note that a non-authorized device (dev->authorized == 0) will only * be put in unconfigured mode. * * Note that USB has an additional level of device configurability, * associated with interfaces. That configurability is accessed using * usb_set_interface(). * * This call is synchronous. The calling context must be able to sleep, * must own the device lock, and must not hold the driver model's USB * bus mutex; usb interface driver probe() methods cannot use this routine. * * Returns zero on success, or else the status code returned by the * underlying call that failed. On successful completion, each interface * in the original device configuration has been destroyed, and each one * in the new configuration has been probed by all relevant usb device * drivers currently known to the kernel. 
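 *
 * As a sketch of the locking rule above (placeholder code, not from this
 * file): a caller outside probe(), holding no USB bus mutex, would do
 *
 *	usb_lock_device(udev);
 *	ret = usb_set_configuration(udev, -1);	/* force unconfigured state */
 *	usb_unlock_device(udev);
 *
 * while interface drivers that need a configuration change must instead use
 * usb_driver_set_configuration().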
*/ int usb_set_configuration(struct usb_device *dev, int configuration) { int i, ret; struct usb_host_config *cp = NULL; struct usb_interface **new_interfaces = NULL; struct usb_hcd *hcd = bus_to_hcd(dev->bus); int n, nintf; if (dev->authorized == 0 || configuration == -1) configuration = 0; else { for (i = 0; i < dev->descriptor.bNumConfigurations; i++) { if (dev->config[i].desc.bConfigurationValue == configuration) { cp = &dev->config[i]; break; } } } if ((!cp && configuration != 0)) return -EINVAL; /* The USB spec says configuration 0 means unconfigured. * But if a device includes a configuration numbered 0, * we will accept it as a correctly configured state. * Use -1 if you really want to unconfigure the device. */ if (cp && configuration == 0) dev_warn(&dev->dev, "config 0 descriptor??\n"); /* Allocate memory for new interfaces before doing anything else, * so that if we run out then nothing will have changed. */ n = nintf = 0; if (cp) { nintf = cp->desc.bNumInterfaces; new_interfaces = kmalloc_array(nintf, sizeof(*new_interfaces), GFP_NOIO); if (!new_interfaces) return -ENOMEM; for (; n < nintf; ++n) { new_interfaces[n] = kzalloc( sizeof(struct usb_interface), GFP_NOIO); if (!new_interfaces[n]) { ret = -ENOMEM; free_interfaces: while (--n >= 0) kfree(new_interfaces[n]); kfree(new_interfaces); return ret; } } i = dev->bus_mA - usb_get_max_power(dev, cp); if (i < 0) dev_warn(&dev->dev, "new config #%d exceeds power " "limit by %dmA\n", configuration, -i); } /* Wake up the device so we can send it the Set-Config request */ ret = usb_autoresume_device(dev); if (ret) goto free_interfaces; /* if it's already configured, clear out old state first. * getting rid of old interfaces means unbinding their drivers. */ if (dev->state != USB_STATE_ADDRESS) usb_disable_device(dev, 1); /* Skip ep0 */ /* Get rid of pending async Set-Config requests for this device */ cancel_async_set_config(dev); /* Make sure we have bandwidth (and available HCD resources) for this * configuration. Remove endpoints from the schedule if we're dropping * this configuration to set configuration 0. After this point, the * host controller will not allow submissions to dropped endpoints. If * this call fails, the device state is unchanged. */ mutex_lock(hcd->bandwidth_mutex); /* Disable LPM, and re-enable it once the new configuration is * installed, so that the xHCI driver can recalculate the U1/U2 * timeouts. */ if (dev->actconfig && usb_disable_lpm(dev)) { dev_err(&dev->dev, "%s Failed to disable LPM\n", __func__); mutex_unlock(hcd->bandwidth_mutex); ret = -ENOMEM; goto free_interfaces; } ret = usb_hcd_alloc_bandwidth(dev, cp, NULL, NULL); if (ret < 0) { if (dev->actconfig) usb_enable_lpm(dev); mutex_unlock(hcd->bandwidth_mutex); usb_autosuspend_device(dev); goto free_interfaces; } /* * Initialize the new interface structures and the * hc/hcd/usbcore interface/endpoint state. */ for (i = 0; i < nintf; ++i) { struct usb_interface_cache *intfc; struct usb_interface *intf; struct usb_host_interface *alt; u8 ifnum; cp->interface[i] = intf = new_interfaces[i]; intfc = cp->intf_cache[i]; intf->altsetting = intfc->altsetting; intf->num_altsetting = intfc->num_altsetting; intf->authorized = !!HCD_INTF_AUTHORIZED(hcd); kref_get(&intfc->ref); alt = usb_altnum_to_altsetting(intf, 0); /* No altsetting 0? We'll assume the first altsetting. * We could use a GetInterface call, but if a device is * so non-compliant that it doesn't have altsetting 0 * then I wouldn't trust its reply anyway. 
*/ if (!alt) alt = &intf->altsetting[0]; ifnum = alt->desc.bInterfaceNumber; intf->intf_assoc = find_iad(dev, cp, ifnum); intf->cur_altsetting = alt; usb_enable_interface(dev, intf, true); intf->dev.parent = &dev->dev; if (usb_of_has_combined_node(dev)) { device_set_of_node_from_dev(&intf->dev, &dev->dev); } else { intf->dev.of_node = usb_of_get_interface_node(dev, configuration, ifnum); } ACPI_COMPANION_SET(&intf->dev, ACPI_COMPANION(&dev->dev)); intf->dev.driver = NULL; intf->dev.bus = &usb_bus_type; intf->dev.type = &usb_if_device_type; intf->dev.groups = usb_interface_groups; INIT_WORK(&intf->reset_ws, __usb_queue_reset_device); INIT_WORK(&intf->wireless_status_work, __usb_wireless_status_intf); intf->minor = -1; device_initialize(&intf->dev); pm_runtime_no_callbacks(&intf->dev); dev_set_name(&intf->dev, "%d-%s:%d.%d", dev->bus->busnum, dev->devpath, configuration, ifnum); usb_get_dev(dev); } kfree(new_interfaces); ret = usb_control_msg_send(dev, 0, USB_REQ_SET_CONFIGURATION, 0, configuration, 0, NULL, 0, USB_CTRL_SET_TIMEOUT, GFP_NOIO); if (ret && cp) { /* * All the old state is gone, so what else can we do? * The device is probably useless now anyway. */ usb_hcd_alloc_bandwidth(dev, NULL, NULL, NULL); for (i = 0; i < nintf; ++i) { usb_disable_interface(dev, cp->interface[i], true); put_device(&cp->interface[i]->dev); cp->interface[i] = NULL; } cp = NULL; } dev->actconfig = cp; mutex_unlock(hcd->bandwidth_mutex); if (!cp) { usb_set_device_state(dev, USB_STATE_ADDRESS); /* Leave LPM disabled while the device is unconfigured. */ usb_autosuspend_device(dev); return ret; } usb_set_device_state(dev, USB_STATE_CONFIGURED); if (cp->string == NULL && !(dev->quirks & USB_QUIRK_CONFIG_INTF_STRINGS)) cp->string = usb_cache_string(dev, cp->desc.iConfiguration); /* Now that the interfaces are installed, re-enable LPM. */ usb_unlocked_enable_lpm(dev); /* Enable LTM if it was turned off by usb_disable_device. */ usb_enable_ltm(dev); /* Now that all the interfaces are set up, register them * to trigger binding of drivers to interfaces. probe() * routines may install different altsettings and may * claim() any interfaces not yet bound. Many class drivers * need that: CDC, audio, video, etc. */ for (i = 0; i < nintf; ++i) { struct usb_interface *intf = cp->interface[i]; if (intf->dev.of_node && !of_device_is_available(intf->dev.of_node)) { dev_info(&dev->dev, "skipping disabled interface %d\n", intf->cur_altsetting->desc.bInterfaceNumber); continue; } dev_dbg(&dev->dev, "adding %s (config #%d, interface %d)\n", dev_name(&intf->dev), configuration, intf->cur_altsetting->desc.bInterfaceNumber); device_enable_async_suspend(&intf->dev); ret = device_add(&intf->dev); if (ret != 0) { dev_err(&dev->dev, "device_add(%s) --> %d\n", dev_name(&intf->dev), ret); continue; } create_intf_ep_devs(intf); } usb_autosuspend_device(dev); return 0; } EXPORT_SYMBOL_GPL(usb_set_configuration); static LIST_HEAD(set_config_list); static DEFINE_SPINLOCK(set_config_lock); struct set_config_request { struct usb_device *udev; int config; struct work_struct work; struct list_head node; }; /* Worker routine for usb_driver_set_configuration() */ static void driver_set_config_work(struct work_struct *work) { struct set_config_request *req = container_of(work, struct set_config_request, work); struct usb_device *udev = req->udev; usb_lock_device(udev); spin_lock(&set_config_lock); list_del(&req->node); spin_unlock(&set_config_lock); if (req->config >= -1) /* Is req still valid? 
*/ usb_set_configuration(udev, req->config); usb_unlock_device(udev); usb_put_dev(udev); kfree(req); } /* Cancel pending Set-Config requests for a device whose configuration * was just changed */ static void cancel_async_set_config(struct usb_device *udev) { struct set_config_request *req; spin_lock(&set_config_lock); list_for_each_entry(req, &set_config_list, node) { if (req->udev == udev) req->config = -999; /* Mark as cancelled */ } spin_unlock(&set_config_lock); } /** * usb_driver_set_configuration - Provide a way for drivers to change device configurations * @udev: the device whose configuration is being updated * @config: the configuration being chosen. * Context: In process context, must be able to sleep * * Device interface drivers are not allowed to change device configurations. * This is because changing configurations will destroy the interface the * driver is bound to and create new ones; it would be like a floppy-disk * driver telling the computer to replace the floppy-disk drive with a * tape drive! * * Still, in certain specialized circumstances the need may arise. This * routine gets around the normal restrictions by using a work thread to * submit the change-config request. * * Return: 0 if the request was successfully queued, error code otherwise. * The caller has no way to know whether the queued request will eventually * succeed. */ int usb_driver_set_configuration(struct usb_device *udev, int config) { struct set_config_request *req; req = kmalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; req->udev = udev; req->config = config; INIT_WORK(&req->work, driver_set_config_work); spin_lock(&set_config_lock); list_add(&req->node, &set_config_list); spin_unlock(&set_config_lock); usb_get_dev(udev); schedule_work(&req->work); return 0; } EXPORT_SYMBOL_GPL(usb_driver_set_configuration); /** * cdc_parse_cdc_header - parse the extra headers present in CDC devices * @hdr: the place to put the results of the parsing * @intf: the interface for which parsing is requested * @buffer: pointer to the extra headers to be parsed * @buflen: length of the extra headers * * This evaluates the extra headers present in CDC devices which * bind the interfaces for data and control and provide details * about the capabilities of the device. 
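 *
 * A typical caller looks roughly like this (sketch only; the interface's
 * extra descriptor block is passed straight through):
 *
 *	struct usb_cdc_parsed_header hdr;
 *	int cnt;
 *
 *	cnt = cdc_parse_cdc_header(&hdr, intf, intf->altsetting->extra,
 *				   intf->altsetting->extralen);
 *	if (cnt < 0 || !hdr.usb_cdc_union_desc)
 *		return -ENODEV;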
* * Return: number of descriptors parsed or -EINVAL * if the header is contradictory beyond salvage */ int cdc_parse_cdc_header(struct usb_cdc_parsed_header *hdr, struct usb_interface *intf, u8 *buffer, int buflen) { /* duplicates are ignored */ struct usb_cdc_union_desc *union_header = NULL; /* duplicates are not tolerated */ struct usb_cdc_header_desc *header = NULL; struct usb_cdc_ether_desc *ether = NULL; struct usb_cdc_mdlm_detail_desc *detail = NULL; struct usb_cdc_mdlm_desc *desc = NULL; unsigned int elength; int cnt = 0; memset(hdr, 0x00, sizeof(struct usb_cdc_parsed_header)); hdr->phonet_magic_present = false; while (buflen > 0) { elength = buffer[0]; if (!elength) { dev_err(&intf->dev, "skipping garbage byte\n"); elength = 1; goto next_desc; } if ((buflen < elength) || (elength < 3)) { dev_err(&intf->dev, "invalid descriptor buffer length\n"); break; } if (buffer[1] != USB_DT_CS_INTERFACE) { dev_err(&intf->dev, "skipping garbage\n"); goto next_desc; } switch (buffer[2]) { case USB_CDC_UNION_TYPE: /* we've found it */ if (elength < sizeof(struct usb_cdc_union_desc)) goto next_desc; if (union_header) { dev_err(&intf->dev, "More than one union descriptor, skipping ...\n"); goto next_desc; } union_header = (struct usb_cdc_union_desc *)buffer; break; case USB_CDC_COUNTRY_TYPE: if (elength < sizeof(struct usb_cdc_country_functional_desc)) goto next_desc; hdr->usb_cdc_country_functional_desc = (struct usb_cdc_country_functional_desc *)buffer; break; case USB_CDC_HEADER_TYPE: if (elength != sizeof(struct usb_cdc_header_desc)) goto next_desc; if (header) return -EINVAL; header = (struct usb_cdc_header_desc *)buffer; break; case USB_CDC_ACM_TYPE: if (elength < sizeof(struct usb_cdc_acm_descriptor)) goto next_desc; hdr->usb_cdc_acm_descriptor = (struct usb_cdc_acm_descriptor *)buffer; break; case USB_CDC_ETHERNET_TYPE: if (elength != sizeof(struct usb_cdc_ether_desc)) goto next_desc; if (ether) return -EINVAL; ether = (struct usb_cdc_ether_desc *)buffer; break; case USB_CDC_CALL_MANAGEMENT_TYPE: if (elength < sizeof(struct usb_cdc_call_mgmt_descriptor)) goto next_desc; hdr->usb_cdc_call_mgmt_descriptor = (struct usb_cdc_call_mgmt_descriptor *)buffer; break; case USB_CDC_DMM_TYPE: if (elength < sizeof(struct usb_cdc_dmm_desc)) goto next_desc; hdr->usb_cdc_dmm_desc = (struct usb_cdc_dmm_desc *)buffer; break; case USB_CDC_MDLM_TYPE: if (elength < sizeof(struct usb_cdc_mdlm_desc)) goto next_desc; if (desc) return -EINVAL; desc = (struct usb_cdc_mdlm_desc *)buffer; break; case USB_CDC_MDLM_DETAIL_TYPE: if (elength < sizeof(struct usb_cdc_mdlm_detail_desc)) goto next_desc; if (detail) return -EINVAL; detail = (struct usb_cdc_mdlm_detail_desc *)buffer; break; case USB_CDC_NCM_TYPE: if (elength < sizeof(struct usb_cdc_ncm_desc)) goto next_desc; hdr->usb_cdc_ncm_desc = (struct usb_cdc_ncm_desc *)buffer; break; case USB_CDC_MBIM_TYPE: if (elength < sizeof(struct usb_cdc_mbim_desc)) goto next_desc; hdr->usb_cdc_mbim_desc = (struct usb_cdc_mbim_desc *)buffer; break; case USB_CDC_MBIM_EXTENDED_TYPE: if (elength < sizeof(struct usb_cdc_mbim_extended_desc)) break; hdr->usb_cdc_mbim_extended_desc = (struct usb_cdc_mbim_extended_desc *)buffer; break; case CDC_PHONET_MAGIC_NUMBER: hdr->phonet_magic_present = true; break; default: /* * there are LOTS more CDC descriptors that * could legitimately be found here. 
*/ dev_dbg(&intf->dev, "Ignoring descriptor: type %02x, length %u\n", buffer[2], elength); goto next_desc; } cnt++; next_desc: buflen -= elength; buffer += elength; } hdr->usb_cdc_union_desc = union_header; hdr->usb_cdc_header_desc = header; hdr->usb_cdc_mdlm_detail_desc = detail; hdr->usb_cdc_mdlm_desc = desc; hdr->usb_cdc_ether_desc = ether; return cnt; } EXPORT_SYMBOL(cdc_parse_cdc_header);
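/*
 * Illustrative sketch (not part of the file above): how a CDC class driver
 * might run cdc_parse_cdc_header() over an interface's class-specific
 * descriptors and pick out the union descriptor. The probe-helper name and
 * error policy are hypothetical; the parser and struct usb_cdc_parsed_header
 * are the ones defined above.
 */
#include <linux/usb.h>
#include <linux/usb/cdc.h>

static int example_cdc_probe(struct usb_interface *intf)
{
	struct usb_host_interface *alt = intf->cur_altsetting;
	struct usb_cdc_parsed_header hdr;
	int cnt;

	cnt = cdc_parse_cdc_header(&hdr, intf, alt->extra, alt->extralen);
	if (cnt < 0)
		return cnt;	/* headers contradictory beyond salvage */

	if (!hdr.usb_cdc_union_desc)
		return -ENODEV;	/* this hypothetical driver needs a union descriptor */

	dev_dbg(&intf->dev, "control interface %u, data interface %u\n",
		hdr.usb_cdc_union_desc->bMasterInterface0,
		hdr.usb_cdc_union_desc->bSlaveInterface0);
	return 0;
}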
// SPDX-License-Identifier: GPL-2.0-only /* * CAIA Delay-Gradient (CDG) congestion control * * This implementation is based on the paper: * D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using * delay gradients." In IFIP Networking, pages 328-341. Springer, 2011. * * Scavenger traffic (Less-than-Best-Effort) should disable coexistence * heuristics using parameters use_shadow=0 and use_ineff=0. * * Parameters window, backoff_beta, and backoff_factor are crucial for * throughput and delay. Future work is needed to determine better defaults, * and to provide guidelines for use in different environments/contexts. * * Except for window, knobs are configured via /sys/module/tcp_cdg/parameters/. * Parameter window is only configurable when loading tcp_cdg as a module. * * Notable differences from paper/FreeBSD: * o Using Hybrid Slow start and Proportional Rate Reduction. * o Add toggle for shadow window mechanism. Suggested by David Hayes. * o Add toggle for non-congestion loss tolerance. * o Scaling parameter G is changed to a backoff factor; * conversion is given by: backoff_factor = 1000/(G * window). * o Limit shadow window to 2 * cwnd, or to cwnd when application limited. * o More accurate e^-x.
*/ #include <linux/kernel.h> #include <linux/random.h> #include <linux/module.h> #include <linux/sched/clock.h> #include <net/tcp.h> #define HYSTART_ACK_TRAIN 1 #define HYSTART_DELAY 2 static int window __read_mostly = 8; static unsigned int backoff_beta __read_mostly = 0.7071 * 1024; /* sqrt 0.5 */ static unsigned int backoff_factor __read_mostly = 42; static unsigned int hystart_detect __read_mostly = 3; static unsigned int use_ineff __read_mostly = 5; static bool use_shadow __read_mostly = true; static bool use_tolerance __read_mostly; module_param(window, int, 0444); MODULE_PARM_DESC(window, "gradient window size (power of two <= 256)"); module_param(backoff_beta, uint, 0644); MODULE_PARM_DESC(backoff_beta, "backoff beta (0-1024)"); module_param(backoff_factor, uint, 0644); MODULE_PARM_DESC(backoff_factor, "backoff probability scale factor"); module_param(hystart_detect, uint, 0644); MODULE_PARM_DESC(hystart_detect, "use Hybrid Slow start " "(0: disabled, 1: ACK train, 2: delay threshold, 3: both)"); module_param(use_ineff, uint, 0644); MODULE_PARM_DESC(use_ineff, "use ineffectual backoff detection (threshold)"); module_param(use_shadow, bool, 0644); MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); module_param(use_tolerance, bool, 0644); MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); struct cdg_minmax { union { struct { s32 min; s32 max; }; u64 v64; }; }; enum cdg_state { CDG_UNKNOWN = 0, CDG_NONFULL = 1, CDG_FULL = 2, CDG_BACKOFF = 3, }; struct cdg { struct cdg_minmax rtt; struct cdg_minmax rtt_prev; struct cdg_minmax *gradients; struct cdg_minmax gsum; bool gfilled; u8 tail; u8 state; u8 delack; u32 rtt_seq; u32 shadow_wnd; u16 backoff_cnt; u16 sample_cnt; s32 delay_min; u32 last_ack; u32 round_start; }; /** * nexp_u32 - negative base-e exponential * @ux: x in units of micro * * Returns exp(ux * -1e-6) * U32_MAX. */ static u32 __pure nexp_u32(u32 ux) { static const u16 v[] = { /* exp(-x)*65536-1 for x = 0, 0.000256, 0.000512, ... */ 65535, 65518, 65501, 65468, 65401, 65267, 65001, 64470, 63422, 61378, 57484, 50423, 38795, 22965, 8047, 987, 14, }; u32 msb = ux >> 8; u32 res; int i; /* Cut off when ux >= 2^24 (actual result is <= 222/U32_MAX). */ if (msb > U16_MAX) return 0; /* Scale first eight bits linearly: */ res = U32_MAX - (ux & 0xff) * (U32_MAX / 1000000); /* Obtain e^(x + y + ...) by computing e^x * e^y * ...: */ for (i = 1; msb; i++, msb >>= 1) { u32 y = v[i & -(msb & 1)] + U32_C(1); res = ((u64)res * y) >> 16; } return res; } /* Based on the HyStart algorithm (by Ha et al.) that is implemented in * tcp_cubic. Differences/experimental changes: * o Using Hayes' delayed ACK filter. * o Using a usec clock for the ACK train. * o Reset ACK train when application limited. * o Invoked at any cwnd (i.e. also when cwnd < 16). * o Invoked only when cwnd < ssthresh (i.e. not when cwnd == ssthresh). 
*/ static void tcp_cdg_hystart_update(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->delay_min = min_not_zero(ca->delay_min, ca->rtt.min); if (ca->delay_min == 0) return; if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now_us = tp->tcp_mstamp; if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { ca->last_ack = now_us; ca->round_start = now_us; } else if (before(now_us, ca->last_ack + 3000)) { u32 base_owd = max(ca->delay_min / 2U, 125U); ca->last_ack = now_us; if (after(now_us, ca->round_start + base_owd)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); return; } } } if (hystart_detect & HYSTART_DELAY) { if (ca->sample_cnt < 8) { ca->sample_cnt++; } else { s32 thresh = max(ca->delay_min + ca->delay_min / 8U, 125U); if (ca->rtt.min > thresh) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYCWND, tcp_snd_cwnd(tp)); tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } } static s32 tcp_cdg_grad(struct cdg *ca) { s32 gmin = ca->rtt.min - ca->rtt_prev.min; s32 gmax = ca->rtt.max - ca->rtt_prev.max; s32 grad; if (ca->gradients) { ca->gsum.min += gmin - ca->gradients[ca->tail].min; ca->gsum.max += gmax - ca->gradients[ca->tail].max; ca->gradients[ca->tail].min = gmin; ca->gradients[ca->tail].max = gmax; ca->tail = (ca->tail + 1) & (window - 1); gmin = ca->gsum.min; gmax = ca->gsum.max; } /* We keep sums to ignore gradients during cwnd reductions; * the paper's smoothed gradients otherwise simplify to: * (rtt_latest - rtt_oldest) / window. * * We also drop division by window here. */ grad = gmin > 0 ? gmin : gmax; /* Extrapolate missing values in gradient window: */ if (!ca->gfilled) { if (!ca->gradients && window > 1) grad *= window; /* Memory allocation failed. */ else if (ca->tail == 0) ca->gfilled = true; else grad = (grad * window) / (int)ca->tail; } /* Backoff was effectual: */ if (gmin <= -32 || gmax <= -32) ca->backoff_cnt = 0; if (use_tolerance) { /* Reduce small variations to zero: */ gmin = DIV_ROUND_CLOSEST(gmin, 64); gmax = DIV_ROUND_CLOSEST(gmax, 64); if (gmin > 0 && gmax <= 0) ca->state = CDG_FULL; else if ((gmin > 0 && gmax > 0) || gmax < 0) ca->state = CDG_NONFULL; } return grad; } static bool tcp_cdg_backoff(struct sock *sk, u32 grad) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); if (get_random_u32() <= nexp_u32(grad * backoff_factor)) return false; if (use_ineff) { ca->backoff_cnt++; if (ca->backoff_cnt > use_ineff) return false; } ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp)); ca->state = CDG_BACKOFF; tcp_enter_cwr(sk); return true; } /* Not called in CWR or Recovery state. 
*/ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); u32 prior_snd_cwnd; u32 incr; if (tcp_in_slow_start(tp) && hystart_detect) tcp_cdg_hystart_update(sk); if (after(ack, ca->rtt_seq) && ca->rtt.v64) { s32 grad = 0; if (ca->rtt_prev.v64) grad = tcp_cdg_grad(ca); ca->rtt_seq = tp->snd_nxt; ca->rtt_prev = ca->rtt; ca->rtt.v64 = 0; ca->last_ack = 0; ca->sample_cnt = 0; if (grad > 0 && tcp_cdg_backoff(sk, grad)) return; } if (!tcp_is_cwnd_limited(sk)) { ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp)); return; } prior_snd_cwnd = tcp_snd_cwnd(tp); tcp_reno_cong_avoid(sk, ack, acked); incr = tcp_snd_cwnd(tp) - prior_snd_cwnd; ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); } static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); if (sample->rtt_us <= 0) return; /* A heuristic for filtering delayed ACKs, adapted from: * D.A. Hayes. "Timing enhancements to the FreeBSD kernel to support * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. */ if (tp->sacked_out == 0) { if (sample->pkts_acked == 1 && ca->delack) { /* A delayed ACK is only used for the minimum if it is * provenly lower than an existing non-zero minimum. */ ca->rtt.min = min(ca->rtt.min, sample->rtt_us); ca->delack--; return; } else if (sample->pkts_acked > 1 && ca->delack < 5) { ca->delack++; } } ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us); ca->rtt.max = max(ca->rtt.max, sample->rtt_us); } static u32 tcp_cdg_ssthresh(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); if (ca->state == CDG_BACKOFF) return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10); if (ca->state == CDG_NONFULL && use_tolerance) return tcp_snd_cwnd(tp); ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp)); if (use_shadow) return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1); return max(2U, tcp_snd_cwnd(tp) >> 1); } static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); struct cdg_minmax *gradients; switch (ev) { case CA_EVENT_CWND_RESTART: gradients = ca->gradients; if (gradients) memset(gradients, 0, window * sizeof(gradients[0])); memset(ca, 0, sizeof(*ca)); ca->gradients = gradients; ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); break; case CA_EVENT_COMPLETE_CWR: ca->state = CDG_UNKNOWN; ca->rtt_seq = tp->snd_nxt; ca->rtt_prev = ca->rtt; ca->rtt.v64 = 0; break; default: break; } } static void tcp_cdg_init(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->gradients = NULL; /* We silently fall back to window = 1 if allocation fails. 
*/ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), GFP_NOWAIT | __GFP_NOWARN); ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); } static void tcp_cdg_release(struct sock *sk) { struct cdg *ca = inet_csk_ca(sk); kfree(ca->gradients); ca->gradients = NULL; } static struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, .undo_cwnd = tcp_reno_undo_cwnd, .ssthresh = tcp_cdg_ssthresh, .release = tcp_cdg_release, .init = tcp_cdg_init, .owner = THIS_MODULE, .name = "cdg", }; static int __init tcp_cdg_register(void) { if (backoff_beta > 1024 || window < 1 || window > 256) return -ERANGE; if (!is_power_of_2(window)) return -EINVAL; BUILD_BUG_ON(sizeof(struct cdg) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_cdg); return 0; } static void __exit tcp_cdg_unregister(void) { tcp_unregister_congestion_control(&tcp_cdg); } module_init(tcp_cdg_register); module_exit(tcp_cdg_unregister); MODULE_AUTHOR("Kenneth Klette Jonassen"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP CDG");
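/*
 * Illustrative user-space sketch (not part of the module above): the module
 * parameters earlier only tune tcp_cdg globally; a single connection opts in
 * to CDG with the standard TCP_CONGESTION socket option, assuming the
 * tcp_cdg module is loaded. Error handling is minimal.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	const char name[] = "cdg";

	if (fd < 0 || setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
				 name, sizeof(name)) < 0) {
		perror("TCP_CONGESTION");
		return 1;
	}
	printf("congestion control set to %s\n", name);
	return 0;
}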
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static inline bool devlink_rate_is_leaf(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } static inline bool devlink_rate_is_node(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; } static struct devlink_rate * devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) { struct devlink_rate *devlink_rate; struct devlink_port *devlink_port; devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); if (IS_ERR(devlink_port)) return ERR_CAST(devlink_port); devlink_rate = devlink_port->devlink_rate; return devlink_rate ?: ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { static struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && !strcmp(node_name, devlink_rate->name)) return devlink_rate; } return ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { const char *rate_node_name; size_t len; if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return ERR_PTR(-EINVAL); rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); len = strlen(rate_node_name); /* Name cannot be empty or decimal number */ if (!len || strspn(rate_node_name, "0123456789") == len) return ERR_PTR(-EINVAL); return devlink_rate_node_get_by_name(devlink, rate_node_name); } static struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } static struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; if (attrs[DEVLINK_ATTR_PORT_INDEX]) return devlink_rate_leaf_get_from_info(devlink, info); else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return devlink_rate_node_get_from_info(devlink, info); else return ERR_PTR(-EINVAL); } static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_rate->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) goto nla_put_failure; if (devlink_rate_is_leaf(devlink_rate)) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_rate->devlink_port->index)) goto nla_put_failure; } else if (devlink_rate_is_node(devlink_rate)) { if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, devlink_rate->name)) goto nla_put_failure; } if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, devlink_rate->tx_share, DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, devlink_rate->tx_max, DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, devlink_rate->tx_weight)) goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, 
hdr); return -EMSGSIZE; } static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_rates_notify_register(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); } void devlink_rates_notify_unregister(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); } static int devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; int idx = 0; int err = 0; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; u32 id = NETLINK_CB(cb->skb).portid; if (idx < state->idx) { idx++; continue; } err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, cb->nlh->nlmsg_seq, flags, NULL); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); } int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; struct sk_buff *msg; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static bool devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, struct devlink_rate *parent) { while (parent) { if (parent == devlink_rate) return true; parent = parent->parent; } return false; } static int devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, struct genl_info *info, struct nlattr *nla_parent) { struct devlink *devlink = devlink_rate->devlink; const char *parent_name = nla_data(nla_parent); const struct devlink_ops *ops = devlink->ops; size_t len = strlen(parent_name); struct devlink_rate *parent; int err = -EOPNOTSUPP; parent = devlink_rate->parent; if (parent && !len) { if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); if (err) return err; refcount_dec(&parent->refcnt); devlink_rate->parent = NULL; } else if (len) { parent = devlink_rate_node_get_by_name(devlink, parent_name); if (IS_ERR(parent)) return -ENODEV; if (parent == devlink_rate) { NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); return -EINVAL; } if (devlink_rate_is_node(devlink_rate) && 
devlink_rate_is_parent_node(devlink_rate, parent->parent)) { NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); return -EEXIST; } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); if (err) return err; if (devlink_rate->parent) /* we're reassigning to other parent in this case */ refcount_dec(&devlink_rate->parent->refcnt); refcount_inc(&parent->refcnt); devlink_rate->parent = parent; } return 0; } static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_share = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_max = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); if (err) return err; devlink_rate->tx_priority = priority; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); if (err) return err; devlink_rate->tx_weight = weight; } nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, nla_parent); if (err) return err; } return 0; } static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, struct genl_info *info, enum devlink_rate_type type) { struct nlattr **attrs = info->attrs; if (type == DEVLINK_RATE_TYPE_LEAF) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_leaf_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { 
NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the leafs"); return false; } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_node_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the nodes"); return false; } } else { WARN(1, "Unknown type of rate object"); return false; } return true; } int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; const struct devlink_ops *ops; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; err = devlink_nl_rate_set(devlink_rate, ops, info); if (!err) devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return err; } int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; const struct devlink_ops *ops; int err; ops = devlink->ops; if (!ops || !ops->rate_node_new || !ops->rate_node_del) { NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); return -EOPNOTSUPP; } if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) return -EOPNOTSUPP; rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); if (!IS_ERR(rate_node)) return -EEXIST; else if (rate_node == ERR_PTR(-EINVAL)) return -EINVAL; rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return -ENOMEM; rate_node->devlink = devlink; rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); if (!rate_node->name) { err = -ENOMEM; goto err_strdup; } err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); if (err) goto err_node_new; err = devlink_nl_rate_set(rate_node, ops, info); if (err) goto err_rate_set; refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return 0; err_rate_set: ops->rate_node_del(rate_node, rate_node->priv, info->extack); err_node_new: kfree(rate_node->name); err_strdup: kfree(rate_node); return err; } int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; int err; rate_node = 
devlink_rate_node_get_from_info(devlink, info); if (IS_ERR(rate_node)) return PTR_ERR(rate_node); if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); err = devlink->ops->rate_node_del(rate_node, rate_node->priv, info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); kfree(rate_node->name); kfree(rate_node); return err; } int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; } /** * devl_rate_node_create - create devlink rate node * @devlink: devlink instance * @priv: driver private data * @node_name: name of the resulting node * @parent: parent devlink_rate struct * * Create devlink rate object of type node */ struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent) { struct devlink_rate *rate_node; rate_node = devlink_rate_node_get_by_name(devlink, node_name); if (!IS_ERR(rate_node)) return ERR_PTR(-EEXIST); rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return ERR_PTR(-ENOMEM); if (parent) { rate_node->parent = parent; refcount_inc(&rate_node->parent->refcnt); } rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->devlink = devlink; rate_node->priv = priv; rate_node->name = kstrdup(node_name, GFP_KERNEL); if (!rate_node->name) { kfree(rate_node); return ERR_PTR(-ENOMEM); } refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return rate_node; } EXPORT_SYMBOL_GPL(devl_rate_node_create); /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * @parent: parent devlink_rate struct * * Create devlink rate object of type leaf on provided @devlink_port. */ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; devl_assert_locked(devlink_port->devlink); if (WARN_ON(devlink_port->devlink_rate)) return -EBUSY; devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); if (!devlink_rate) return -ENOMEM; if (parent) { devlink_rate->parent = parent; refcount_inc(&devlink_rate->parent->refcnt); } devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; devlink_rate->priv = priv; list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** * devl_rate_leaf_destroy - destroy devlink rate leaf * * @devlink_port: devlink port linked to the rate object * * Destroy the devlink rate object of type leaf on provided @devlink_port. 
*/ void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { struct devlink_rate *devlink_rate = devlink_port->devlink_rate; devl_assert_locked(devlink_port->devlink); if (!devlink_rate) return; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); if (devlink_rate->parent) refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; kfree(devlink_rate); } EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. */ void devl_rate_nodes_destroy(struct devlink *devlink) { static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; devl_assert_locked(devlink); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); list_del(&devlink_rate->list); kfree(devlink_rate->name); kfree(devlink_rate); } } } EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
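/*
 * Illustrative driver-side sketch (not part of the file above): creating a
 * rate node and parenting a port's leaf rate object under it with the
 * helpers exported above. The example_* names and priv pointers are
 * hypothetical; the caller is assumed to hold the devlink instance lock,
 * which the devl_* helpers assert.
 */
#include <net/devlink.h>

static int example_setup_rate_objects(struct devlink *devlink,
				      struct devlink_port *port,
				      void *node_priv, void *leaf_priv)
{
	struct devlink_rate *group;
	char name[] = "group0";

	group = devl_rate_node_create(devlink, node_priv, name, NULL);
	if (IS_ERR(group))
		return PTR_ERR(group);

	/* The leaf is created on the port and parented to the node above. */
	return devl_rate_leaf_create(port, leaf_priv, group);
}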
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018, Intel Corporation. */ /* A common module to handle registrations and notifications for paravirtual * drivers to enable accelerated datapath and support VF live migration. * * The notifier and event handling code is based on netvsc driver. */ #include <linux/module.h> #include <linux/etherdevice.h> #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/failover.h> static LIST_HEAD(failover_list); static DEFINE_SPINLOCK(failover_lock); static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops) { struct net_device *failover_dev; struct failover *failover; spin_lock(&failover_lock); list_for_each_entry(failover, &failover_list, list) { failover_dev = rtnl_dereference(failover->failover_dev); if (ether_addr_equal(failover_dev->perm_addr, mac)) { *ops = rtnl_dereference(failover->ops); spin_unlock(&failover_lock); return failover_dev; } } spin_unlock(&failover_lock); return NULL; } /** * failover_slave_register - Register a slave netdev * * @slave_dev: slave netdev that is being registered * * Registers a slave device to a failover instance. Only ethernet devices * are supported.
*/ static int failover_slave_register(struct net_device *slave_dev) { struct netdev_lag_upper_info lag_upper_info; struct net_device *failover_dev; struct failover_ops *fops; int err; if (slave_dev->type != ARPHRD_ETHER) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_register && fops->slave_pre_register(slave_dev, failover_dev)) goto done; err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame, failover_dev); if (err) { netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n", err); goto done; } lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL, &lag_upper_info, NULL); if (err) { netdev_err(slave_dev, "can not set failover device %s (err = %d)\n", failover_dev->name, err); goto err_upper_link; } slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: return NOTIFY_DONE; } /** * failover_slave_unregister - Unregister a slave netdev * * @slave_dev: slave netdev that is being unregistered * * Unregisters a slave device from a failover instance. */ int failover_slave_unregister(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (fops && fops->slave_pre_unregister && fops->slave_pre_unregister(slave_dev, failover_dev)) goto done; netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } EXPORT_SYMBOL_GPL(failover_slave_unregister); static int failover_slave_link_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_link_change && !fops->slave_link_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_slave_name_change(struct net_device *slave_dev) { struct net_device *failover_dev; struct failover_ops *fops; if (!netif_is_failover_slave(slave_dev)) goto done; ASSERT_RTNL(); failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops); if (!failover_dev) goto done; if (!netif_running(failover_dev)) goto done; if (fops && fops->slave_name_change && !fops->slave_name_change(slave_dev, failover_dev)) return NOTIFY_OK; done: return NOTIFY_DONE; } static int failover_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); /* Skip parent events */ if (netif_is_failover(event_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: return failover_slave_register(event_dev); case NETDEV_UNREGISTER: return failover_slave_unregister(event_dev); case NETDEV_UP: case NETDEV_DOWN: case 
NETDEV_CHANGE: return failover_slave_link_change(event_dev); case NETDEV_CHANGENAME: return failover_slave_name_change(event_dev); default: return NOTIFY_DONE; } } static struct notifier_block failover_notifier = { .notifier_call = failover_event, }; static void failover_existing_slave_register(struct net_device *failover_dev) { struct net *net = dev_net(failover_dev); struct net_device *dev; rtnl_lock(); for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) failover_slave_register(dev); } rtnl_unlock(); } /** * failover_register - Register a failover instance * * @dev: failover netdev * @ops: failover ops * * Allocate and register a failover instance for a failover netdev. ops * provides handlers for slave device register/unregister/link change/ * name change events. * * Return: pointer to failover instance */ struct failover *failover_register(struct net_device *dev, struct failover_ops *ops) { struct failover *failover; if (dev->type != ARPHRD_ETHER) return ERR_PTR(-EINVAL); failover = kzalloc(sizeof(*failover), GFP_KERNEL); if (!failover) return ERR_PTR(-ENOMEM); rcu_assign_pointer(failover->ops, ops); netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL); dev->priv_flags |= IFF_FAILOVER; rcu_assign_pointer(failover->failover_dev, dev); spin_lock(&failover_lock); list_add_tail(&failover->list, &failover_list); spin_unlock(&failover_lock); netdev_info(dev, "failover master:%s registered\n", dev->name); failover_existing_slave_register(dev); return failover; } EXPORT_SYMBOL_GPL(failover_register); /** * failover_unregister - Unregister a failover instance * * @failover: pointer to failover instance * * Unregisters and frees a failover instance. */ void failover_unregister(struct failover *failover) { struct net_device *failover_dev; failover_dev = rcu_dereference(failover->failover_dev); netdev_info(failover_dev, "failover master:%s unregistered\n", failover_dev->name); failover_dev->priv_flags &= ~IFF_FAILOVER; netdev_put(failover_dev, &failover->dev_tracker); spin_lock(&failover_lock); list_del(&failover->list); spin_unlock(&failover_lock); kfree(failover); } EXPORT_SYMBOL_GPL(failover_unregister); static __init int failover_init(void) { register_netdevice_notifier(&failover_notifier); return 0; } module_init(failover_init); static __exit void failover_exit(void) { unregister_netdevice_notifier(&failover_notifier); } module_exit(failover_exit); MODULE_DESCRIPTION("Generic failover infrastructure/interface"); MODULE_LICENSE("GPL v2");
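/*
 * Illustrative sketch (not part of the module above): a paravirtual driver
 * registering its upper netdev with the failover core. The example_* names
 * and the callback body are hypothetical; failover_register() and the
 * failover_ops callbacks are the interface shown above, and
 * slave_handle_frame is set because the core passes it straight to
 * netdev_rx_handler_register().
 */
#include <linux/netdevice.h>
#include <net/failover.h>

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	/* A real driver would steer the frame into its own receive path. */
	return RX_HANDLER_PASS;
}

static struct failover_ops example_failover_ops = {
	.slave_handle_frame	= example_handle_frame,
};

static int example_register(struct net_device *upper_dev,
			    struct failover **pfailover)
{
	struct failover *failover;

	failover = failover_register(upper_dev, &example_failover_ops);
	if (IS_ERR(failover))
		return PTR_ERR(failover);

	*pfailover = failover;
	return 0;
}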
// SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_etf.c Earliest TxTime First queueing discipline. * * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> * Vinicius Costa Gomes <vinicius.gomes@intel.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/rbtree.h> #include <linux/skbuff.h> #include <linux/posix-timers.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/sock.h> #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) #define SKIP_SOCK_CHECK_IS_SET(x) ((x)->flags & TC_ETF_SKIP_SOCK_CHECK) struct etf_sched_data { bool offload; bool deadline_mode; bool skip_sock_check; int clockid; int queue; s32 delta; /* in ns */ ktime_t last; /* The txtime of the last skb sent to the netdevice. */ struct rb_root_cached head; struct qdisc_watchdog watchdog; ktime_t (*get_time)(void); }; static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, }; static inline int validate_input_params(struct tc_etf_qopt *qopt, struct netlink_ext_ack *extack) { /* Check if params comply to the following rules: * * Clockid and delta must be valid. * * * Dynamic clockids are not supported. * * * Delta must be a positive integer.
* * Also note that for the HW offload case, we must * expect that system clocks have been synchronized to PHC. */ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); return -ENOTSUPP; } if (qopt->clockid != CLOCK_TAI) { NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); return -EINVAL; } if (qopt->delta < 0) { NL_SET_ERR_MSG(extack, "Delta must be positive"); return -EINVAL; } return 0; } static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) { struct etf_sched_data *q = qdisc_priv(sch); ktime_t txtime = nskb->tstamp; struct sock *sk = nskb->sk; ktime_t now; if (q->skip_sock_check) goto skip; if (!sk || !sk_fullsock(sk)) return false; if (!sock_flag(sk, SOCK_TXTIME)) return false; /* We don't perform crosstimestamping. * Drop if packet's clockid differs from qdisc's. */ if (sk->sk_clockid != q->clockid) return false; if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; skip: now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; return true; } static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p; p = rb_first_cached(&q->head); if (!p) return NULL; return rb_to_skb(p); } static void reset_watchdog(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = etf_peek_timesortedlist(sch); ktime_t next; if (!skb) { qdisc_watchdog_cancel(&q->watchdog); return; } next = ktime_sub_ns(skb->tstamp, q->delta); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) { struct sock_exterr_skb *serr; struct sk_buff *clone; ktime_t txtime = skb->tstamp; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors)) return; clone = skb_clone(skb, GFP_ATOMIC); if (!clone) return; serr = SKB_EXT_ERR(clone); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; serr->ee.ee_type = 0; serr->ee.ee_code = code; serr->ee.ee_pad = 0; serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ serr->ee.ee_info = txtime; /* low part of tstamp */ if (sock_queue_err_skb(sk, clone)) kfree_skb(clone); } static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node **p = &q->head.rb_root.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; bool leftmost = true; if (!is_packet_valid(sch, nskb)) { report_sock_error(nskb, EINVAL, SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); } while (*p) { struct sk_buff *skb; parent = *p; skb = rb_to_skb(parent); if (ktime_compare(txtime, skb->tstamp) >= 0) { p = &parent->rb_right; leftmost = false; } else { p = &parent->rb_left; } } rb_link_node(&nskb->rbnode, parent, p); rb_insert_color_cached(&nskb->rbnode, &q->head, leftmost); qdisc_qstats_backlog_inc(sch, nskb); sch->q.qlen++; /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return NET_XMIT_SUCCESS; } static void timesortedlist_drop(struct Qdisc *sch, struct sk_buff *skb, ktime_t now) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *to_free = NULL; struct sk_buff *tmp = NULL; skb_rbtree_walk_from_safe(skb, tmp) { if (ktime_after(skb->tstamp, now)) break; rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. 
*/ skb->next = NULL; skb->prev = NULL; skb->dev = qdisc_dev(sch); report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, &to_free); qdisc_qstats_overlimit(sch); sch->q.qlen--; } kfree_skb_list(to_free); } static void timesortedlist_remove(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); rb_erase_cached(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. */ skb->next = NULL; skb->prev = NULL; skb->dev = qdisc_dev(sch); qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); q->last = skb->tstamp; sch->q.qlen--; } static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; ktime_t now, next; skb = etf_peek_timesortedlist(sch); if (!skb) return NULL; now = q->get_time(); /* Drop if packet has expired while in queue. */ if (ktime_before(skb->tstamp, now)) { timesortedlist_drop(sch, skb, now); skb = NULL; goto out; } /* When in deadline mode, dequeue as soon as possible and change the * txtime from deadline to (now + delta). */ if (q->deadline_mode) { timesortedlist_remove(sch, skb); skb->tstamp = now; goto out; } next = ktime_sub_ns(skb->tstamp, q->delta); /* Dequeue only if now is within the [txtime - delta, txtime] range. */ if (ktime_after(now, next)) timesortedlist_remove(sch, skb); else skb = NULL; out: /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return skb; } static void etf_disable_offload(struct net_device *dev, struct etf_sched_data *q) { struct tc_etf_qopt_offload etf = { }; const struct net_device_ops *ops; int err; if (!q->offload) return; ops = dev->netdev_ops; if (!ops->ndo_setup_tc) return; etf.queue = q->queue; etf.enable = 0; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) pr_warn("Couldn't disable ETF offload for queue %d\n", etf.queue); } static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_etf_qopt_offload etf = { }; int err; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); return -EOPNOTSUPP; } etf.queue = q->queue; etf.enable = 1; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) { NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); return err; } return 0; } static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_ETF_MAX + 1]; struct tc_etf_qopt *qopt; int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing ETF qdisc options which are mandatory"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_ETF_MAX, opt, etf_policy, extack); if (err < 0) return err; if (!tb[TCA_ETF_PARMS]) { NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); return -EINVAL; } qopt = nla_data(tb[TCA_ETF_PARMS]); pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? 
"on" : "off"); err = validate_input_params(qopt, extack); if (err < 0) return err; q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); if (OFFLOAD_IS_ON(qopt)) { err = etf_enable_offload(dev, q, extack); if (err < 0) return err; } /* Everything went OK, save the parameters used. */ q->delta = qopt->delta; q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); q->skip_sock_check = SKIP_SOCK_CHECK_IS_SET(qopt); switch (q->clockid) { case CLOCK_REALTIME: q->get_time = ktime_get_real; break; case CLOCK_MONOTONIC: q->get_time = ktime_get; break; case CLOCK_BOOTTIME: q->get_time = ktime_get_boottime; break; case CLOCK_TAI: q->get_time = ktime_get_clocktai; break; default: NL_SET_ERR_MSG(extack, "Clockid is not supported"); return -ENOTSUPP; } qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); return 0; } static void timesortedlist_clear(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p = rb_first_cached(&q->head); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase_cached(&skb->rbnode, &q->head); rtnl_kfree_skbs(skb, skb); sch->q.qlen--; } } static void etf_reset(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); /* No matter which mode we are on, it's safe to clear both lists. */ timesortedlist_clear(sch); __qdisc_reset_queue(&sch->q); q->last = 0; } static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); struct tc_etf_qopt opt = { }; struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; opt.delta = q->delta; opt.clockid = q->clockid; if (q->offload) opt.flags |= TC_ETF_OFFLOAD_ON; if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; if (q->skip_sock_check) opt.flags |= TC_ETF_SKIP_SOCK_CHECK; if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct Qdisc_ops etf_qdisc_ops __read_mostly = { .id = "etf", .priv_size = sizeof(struct etf_sched_data), .enqueue = etf_enqueue_timesortedlist, .dequeue = etf_dequeue_timesortedlist, .peek = etf_peek_timesortedlist, .init = etf_init, .reset = etf_reset, .destroy = etf_destroy, .dump = etf_dump, .owner = THIS_MODULE, }; static int __init etf_module_init(void) { return register_qdisc(&etf_qdisc_ops); } static void __exit etf_module_exit(void) { unregister_qdisc(&etf_qdisc_ops); } module_init(etf_module_init) module_exit(etf_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Earliest TxTime First (ETF) qdisc");
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLOCKGROUP_LOCK_H #define _LINUX_BLOCKGROUP_LOCK_H /* * Per-blockgroup locking for ext2 and ext3. * * Simple hashed spinlocking. */ #include <linux/spinlock.h> #include <linux/cache.h> #ifdef CONFIG_SMP #define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else #define NR_BG_LOCKS 1 #endif struct bgl_lock { spinlock_t lock; } ____cacheline_aligned_in_smp; struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; }; static inline void bgl_lock_init(struct blockgroup_lock *bgl) { int i; for (i = 0; i < NR_BG_LOCKS; i++) spin_lock_init(&bgl->locks[i].lock); } static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif
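/*
 * Illustrative sketch (not part of the header above): how a filesystem's
 * per-superblock info might embed a blockgroup_lock and serialize updates to
 * one block group's counters. example_sb_info and the helpers are
 * hypothetical; ext2/ext4 follow the same pattern with an sb_bgl_lock()
 * wrapper around bgl_lock_ptr().
 */
#include <linux/blockgroup_lock.h>

struct example_sb_info {
	struct blockgroup_lock s_blockgroup_lock;	/* bgl_lock_init() at mount time */
	/* ... per-group counters, bitmaps, etc. ... */
};

static inline spinlock_t *
example_bgl_lock(struct example_sb_info *sbi, unsigned int block_group)
{
	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
}

static void example_update_group(struct example_sb_info *sbi,
				 unsigned int block_group)
{
	spin_lock(example_bgl_lock(sbi, block_group));
	/* modify the group's free-block / free-inode counters here */
	spin_unlock(example_bgl_lock(sbi, block_group));
}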
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLOCK_BLK_PM_H_ #define _BLOCK_BLK_PM_H_ #include <linux/pm_runtime.h> #ifdef CONFIG_PM static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { if (!q->dev || !blk_queue_pm_only(q)) return 1; /* Nothing to do */ if (pm && q->rpm_status != RPM_SUSPENDED) return 1; /* Request allowed */ pm_request_resume(q->dev); return 0; } static inline void blk_pm_mark_last_busy(struct request *rq) { if (rq->q->dev && !(rq->rq_flags & RQF_PM)) pm_runtime_mark_last_busy(rq->q->dev); } #else static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { return 1; } static inline void blk_pm_mark_last_busy(struct request *rq) { } #endif #endif /* _BLOCK_BLK_PM_H_ */
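A hedged sketch of where these helpers are meant to sit: the dispatch side asks blk_pm_resume_queue() whether a request may proceed (kicking an asynchronous resume when the device is runtime suspended), and the completion side refreshes the autosuspend timer. The wrapper names below are hypothetical; the real block layer performs these calls inside its own dispatch and completion paths.

#include <linux/blk-mq.h>
#include "blk-pm.h"

/* Hypothetical wrappers, for illustration of the intended call sites only. */
static bool my_dispatch_allowed(struct request *rq)
{
	bool pm = rq->rq_flags & RQF_PM;	/* request issued by the PM core? */

	/*
	 * A return of 0 means the queue is runtime suspended and a resume
	 * has been requested asynchronously; the request should stay queued
	 * and be retried once the device is active again.
	 */
	return blk_pm_resume_queue(pm, rq->q) != 0;
}

static void my_request_done(struct request *rq)
{
	/* Normal (non-PM) I/O pushes back the autosuspend deadline. */
	blk_pm_mark_last_busy(rq);
}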
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <linux/fsnotify.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "sync.h" struct io_sync { struct file *file; loff_t len; loff_t off; int flags; int mode; }; int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); sync->flags = READ_ONCE(sqe->sync_range_flags); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); int ret; /* sync_file_range always requires a blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); return IOU_OK; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; sync->flags = READ_ONCE(sqe->fsync_flags); if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fsync(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); loff_t end = sync->off + sync->len; int ret; /* fsync always requires a blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); return IOU_OK; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->addr); sync->mode = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) { struct io_sync *sync = io_kiocb_to_cmd(req, struct io_sync); int ret; /* fallocate always requiring blocking context */ WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); return IOU_OK; }
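For context, these handlers back the IORING_OP_FSYNC, IORING_OP_SYNC_FILE_RANGE and IORING_OP_FALLOCATE opcodes. Below is a userspace sketch of the fsync path, assuming liburing is available; error handling is kept to a minimum.

#include <liburing.h>

/* Sketch: submit one IORING_OP_FSYNC and wait for its completion. */
static int fsync_via_uring(int fd)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0)
		return ret;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_fsync(sqe, fd, 0);	/* or IORING_FSYNC_DATASYNC */
	io_uring_submit(&ring);

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret == 0) {
		ret = cqe->res;			/* 0 on success, -errno on failure */
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return ret;
}

Since the prep handlers above set REQ_F_FORCE_ASYNC, these operations are always executed from a blocking worker context rather than inline in the submission path.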
// SPDX-License-Identifier: GPL-2.0-or-later /* * IP Payload Compression Protocol (IPComp) - RFC3173. * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> * * Todo: * - Tunable compression parameters. * - Compression stats. * - Adaptive compression. */ #include <linux/module.h> #include <linux/err.h> #include <linux/rtnetlink.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/icmp.h> #include <net/ipcomp.h> #include <net/protocol.h> #include <net/sock.h> static int ipcomp4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); __be32 spi; const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } spi = htonl(ntohs(ipch->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP); else ipv4_redirect(skb, net, 0, IPPROTO_COMP); xfrm_state_put(x); return 0; } /* We always hold one tunnel user reference to indicate a tunnel */ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); struct xfrm_state *t; t = xfrm_state_alloc(net); if (!t) goto out; t->id.proto = IPPROTO_IPIP; t->id.spi = x->props.saddr.a4; t->id.daddr.a4 = x->id.daddr.a4; memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET; t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; atomic_set(&t->tunnel_users, 1); out: return t; error: t->km.state = XFRM_STATE_DEAD; xfrm_state_put(t); t = NULL; goto out; } /* * Must be protected by xfrm_cfg_mutex. State and tunnel user references are * always incremented on success.
*/ static int ipcomp_tunnel_attach(struct xfrm_state *x) { struct net *net = xs_net(x); int err = 0; struct xfrm_state *t; u32 mark = x->mark.v & x->mark.m; t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4, x->props.saddr.a4, IPPROTO_IPIP, AF_INET); if (!t) { t = ipcomp_tunnel_create(x); if (!t) { err = -EINVAL; goto out; } xfrm_state_insert(t); xfrm_state_hold(t); } x->tunnel = t; atomic_inc(&t->tunnel_users); out: return err; } static int ipcomp4_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { int err = -EINVAL; x->props.header_len = 0; switch (x->props.mode) { case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct iphdr); break; default: NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp"); goto out; } err = ipcomp_init_state(x, extack); if (err) goto out; if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp_tunnel_attach(x); if (err) { NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state"); goto out; } } err = 0; out: return err; } static int ipcomp4_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ipcomp_type = { .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp4_init_state, .destructor = ipcomp_destroy, .input = ipcomp_input, .output = ipcomp_output }; static struct xfrm4_protocol ipcomp4_protocol = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = ipcomp4_rcv_cb, .err_handler = ipcomp4_err, .priority = 0, }; static int __init ipcomp4_init(void) { if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); return -EAGAIN; } return 0; } static void __exit ipcomp4_fini(void) { if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); xfrm_unregister_type(&ipcomp_type, AF_INET); } module_init(ipcomp4_init); module_exit(ipcomp4_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
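IPComp identifies a state by a 16-bit CPI rather than a full 32-bit SPI, which is why both the error handler and the lookups above widen the CPI with htonl(ntohs(ipch->cpi)). Below is a tiny standalone sketch of that conversion, for illustration only.

#include <arpa/inet.h>
#include <stdint.h>

/* Zero-extend a network-order 16-bit CPI into the network-order 32-bit SPI
 * slot expected by the generic xfrm state lookup. */
static uint32_t cpi_to_spi(uint16_t cpi_be)
{
	/* e.g. CPI 0x1234 on the wire becomes SPI 0x00001234 in network order */
	return htonl((uint32_t)ntohs(cpi_be));
}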
// SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include "kstrtox.h" /** * bitmap_parse_user - convert an ASCII hex string in a user buffer into a bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. */ int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parse(buf, UINT_MAX, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parse_user); /** * bitmap_print_to_pagebuf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * @buf: page aligned buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * * Output format is a comma-separated list of decimal numbers and * ranges if list is specified or hex digits grouped into comma-separated * sets of 8 digits/set. Returns the number of characters written to buf.
* * It is assumed that @buf is a pointer into a PAGE_SIZE, page-aligned * area and that sufficient storage remains at @buf to accommodate the * bitmap_print_to_pagebuf() output. Returns the number of characters * actually printed to @buf, excluding terminating '\0'. */ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits) { ptrdiff_t len = PAGE_SIZE - offset_in_page(buf); return list ? scnprintf(buf, len, "%*pbl\n", nmaskbits, maskp) : scnprintf(buf, len, "%*pb\n", nmaskbits, maskp); } EXPORT_SYMBOL(bitmap_print_to_pagebuf); /** * bitmap_print_to_buf - convert bitmap to list or hex format ASCII string * @list: indicates whether the bitmap must be list * true: print in decimal list format * false: print in hexadecimal bitmask format * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print */ static int bitmap_print_to_buf(bool list, char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { const char *fmt = list ? "%*pbl\n" : "%*pb\n"; ssize_t size; void *data; data = kasprintf(GFP_KERNEL, fmt, nmaskbits, maskp); if (!data) return -ENOMEM; size = memory_read_from_buffer(buf, count, &off, data, strlen(data) + 1); kfree(data); return size; } /** * bitmap_print_bitmask_to_buf - convert bitmap to hex bitmask format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * The bitmap_print_to_pagebuf() is used indirectly via its cpumap wrapper * cpumap_print_to_pagebuf() or directly by drivers to export hexadecimal * bitmask and decimal list to userspace by sysfs ABI. * Drivers might be using a normal attribute for this kind of ABIs. A * normal attribute typically has show entry as below:: * * static ssize_t example_attribute_show(struct device *dev, * struct device_attribute *attr, char *buf) * { * ... * return bitmap_print_to_pagebuf(true, buf, &mask, nr_trig_max); * } * * show entry of attribute has no offset and count parameters and this * means the file is limited to one page only. * bitmap_print_to_pagebuf() API works terribly well for this kind of * normal attribute with buf parameter and without offset, count:: * * bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, * int nmaskbits) * { * } * * The problem is once we have a large bitmap, we have a chance to get a * bitmask or list more than one page. Especially for list, it could be * as complex as 0,3,5,7,9,... We have no simple way to know it exact size. * It turns out bin_attribute is a way to break this limit. bin_attribute * has show entry as below:: * * static ssize_t * example_bin_attribute_show(struct file *filp, struct kobject *kobj, * struct bin_attribute *attr, char *buf, * loff_t offset, size_t count) * { * ... * } * * With the new offset and count parameters, this makes sysfs ABI be able * to support file size more than one page. For example, offset could be * >= 4096. * bitmap_print_bitmask_to_buf(), bitmap_print_list_to_buf() wit their * cpumap wrapper cpumap_print_bitmask_to_buf(), cpumap_print_list_to_buf() * make those drivers be able to support large bitmask and list after they * move to use bin_attribute. 
In result, we have to pass the corresponding * parameters such as off, count from bin_attribute show entry to this API. * * The role of cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf() * is similar with cpumap_print_to_pagebuf(), the difference is that * bitmap_print_to_pagebuf() mainly serves sysfs attribute with the assumption * the destination buffer is exactly one page and won't be more than one page. * cpumap_print_bitmask_to_buf() and cpumap_print_list_to_buf(), on the other * hand, mainly serves bin_attribute which doesn't work with exact one page, * and it can break the size limit of converted decimal list and hexadecimal * bitmask. * * WARNING! * * This function is not a replacement for sprintf() or bitmap_print_to_pagebuf(). * It is intended to workaround sysfs limitations discussed above and should be * used carefully in general case for the following reasons: * * - Time complexity is O(nbits^2/count), comparing to O(nbits) for snprintf(). * - Memory complexity is O(nbits), comparing to O(1) for snprintf(). * - @off and @count are NOT offset and number of bits to print. * - If printing part of bitmap as list, the resulting string is not a correct * list representation of bitmap. Particularly, some bits within or out of * related interval may be erroneously set or unset. The format of the string * may be broken, so bitmap_parselist-like parser may fail parsing it. * - If printing the whole bitmap as list by parts, user must ensure the order * of calls of the function such that the offset is incremented linearly. * - If printing the whole bitmap as list by parts, user must keep bitmap * unchanged between the very first and very last call. Otherwise concatenated * result may be incorrect, and format may be broken. * * Returns the number of characters actually printed to @buf */ int bitmap_print_bitmask_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(false, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_bitmask_to_buf); /** * bitmap_print_list_to_buf - convert bitmap to decimal list format ASCII string * @buf: buffer into which string is placed * @maskp: pointer to bitmap to convert * @nmaskbits: size of bitmap, in bits * @off: in the string from which we are copying, We copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above bitmap_print_bitmask_to_buf() except * the print format. */ int bitmap_print_list_to_buf(char *buf, const unsigned long *maskp, int nmaskbits, loff_t off, size_t count) { return bitmap_print_to_buf(true, buf, maskp, nmaskbits, off, count); } EXPORT_SYMBOL(bitmap_print_list_to_buf); /* * Region 9-38:4/10 describes the following bitmap structure: * 0 9 12 18 38 N * .........****......****......****.................. 
* ^ ^ ^ ^ ^ * start off group_len end nbits */ struct region { unsigned int start; unsigned int off; unsigned int group_len; unsigned int end; unsigned int nbits; }; static void bitmap_set_region(const struct region *r, unsigned long *bitmap) { unsigned int start; for (start = r->start; start <= r->end; start += r->group_len) bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); } static int bitmap_check_region(const struct region *r) { if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) return -EINVAL; if (r->end >= r->nbits) return -ERANGE; return 0; } static const char *bitmap_getnum(const char *str, unsigned int *num, unsigned int lastbit) { unsigned long long n; unsigned int len; if (str[0] == 'N') { *num = lastbit; return str + 1; } len = _parse_integer(str, 10, &n); if (!len) return ERR_PTR(-EINVAL); if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n) return ERR_PTR(-EOVERFLOW); *num = n; return str + len; } static inline bool end_of_str(char c) { return c == '\0' || c == '\n'; } static inline bool __end_of_region(char c) { return isspace(c) || c == ','; } static inline bool end_of_region(char c) { return __end_of_region(c) || end_of_str(c); } /* * The format allows commas and whitespaces at the beginning * of the region. */ static const char *bitmap_find_region(const char *str) { while (__end_of_region(*str)) str++; return end_of_str(*str) ? NULL : str; } static const char *bitmap_find_region_reverse(const char *start, const char *end) { while (start <= end && __end_of_region(*end)) end--; return end; } static const char *bitmap_parse_region(const char *str, struct region *r) { unsigned int lastbit = r->nbits - 1; if (!strncasecmp(str, "all", 3)) { r->start = 0; r->end = lastbit; str += 3; goto check_pattern; } str = bitmap_getnum(str, &r->start, lastbit); if (IS_ERR(str)) return str; if (end_of_region(*str)) goto no_end; if (*str != '-') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->end, lastbit); if (IS_ERR(str)) return str; check_pattern: if (end_of_region(*str)) goto no_pattern; if (*str != ':') return ERR_PTR(-EINVAL); str = bitmap_getnum(str + 1, &r->off, lastbit); if (IS_ERR(str)) return str; if (*str != '/') return ERR_PTR(-EINVAL); return bitmap_getnum(str + 1, &r->group_len, lastbit); no_end: r->end = r->start; no_pattern: r->off = r->end + 1; r->group_len = r->end + 1; return end_of_str(*str) ? NULL : str; } /** * bitmap_parselist - convert list format ASCII string to bitmap * @buf: read user string from this buffer; must be terminated * with a \0 or \n. * @maskp: write resulting mask here * @nmaskbits: number of bits in mask to be written * * Input format is a comma-separated list of decimal numbers and * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. * Optionally each range can be postfixed to denote that only parts of it * should be set. The range will divided to groups of specific size. * From each group will be used only defined amount of bits. * Syntax: range:used_size/group_size * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * The value 'N' can be used as a dynamically substituted token for the * maximum allowed value; i.e (nmaskbits - 1). Keep in mind that it is * dynamic, so if system changes cause the bitmap width to change, such * as more cores in a CPU list, then any ranges using N will also change. * * Returns: 0 on success, -errno on invalid input strings. 
Error values: * * - ``-EINVAL``: wrong region format * - ``-EINVAL``: invalid character in string * - ``-ERANGE``: bit number specified too large for mask * - ``-EOVERFLOW``: integer overflow in the input parameters */ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) { struct region r; long ret; r.nbits = nmaskbits; bitmap_zero(maskp, r.nbits); while (buf) { buf = bitmap_find_region(buf); if (buf == NULL) return 0; buf = bitmap_parse_region(buf, &r); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_check_region(&r); if (ret) return ret; bitmap_set_region(&r, maskp); } return 0; } EXPORT_SYMBOL(bitmap_parselist); /** * bitmap_parselist_user() - convert user buffer's list format ASCII * string to bitmap * * @ubuf: pointer to user buffer containing string. * @ulen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Wrapper for bitmap_parselist(), providing it with user buffer. */ int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *maskp, int nmaskbits) { char *buf; int ret; buf = memdup_user_nul(ubuf, ulen); if (IS_ERR(buf)) return PTR_ERR(buf); ret = bitmap_parselist(buf, maskp, nmaskbits); kfree(buf); return ret; } EXPORT_SYMBOL(bitmap_parselist_user); static const char *bitmap_get_x32_reverse(const char *start, const char *end, u32 *num) { u32 ret = 0; int c, i; for (i = 0; i < 32; i += 4) { c = hex_to_bin(*end--); if (c < 0) return ERR_PTR(-EINVAL); ret |= c << i; if (start > end || __end_of_region(*end)) goto out; } if (hex_to_bin(*end--) >= 0) return ERR_PTR(-EOVERFLOW); out: *num = ret; return end; } /** * bitmap_parse - convert an ASCII hex string into a bitmap. * @start: pointer to buffer containing string. * @buflen: buffer size in bytes. If string is smaller than this * then it must be terminated with a \0 or \n. In that case, * UINT_MAX may be provided instead of string length. * @maskp: pointer to bitmap array that will contain result. * @nmaskbits: size of bitmap, in bits. * * Commas group hex digits into chunks. Each chunk defines exactly 32 * bits of the resultant bitmask. No chunk may specify a value larger * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value * then leading 0-bits are prepended. %-EINVAL is returned for illegal * characters. Grouping such as "1,,5", ",44", "," or "" is allowed. * Leading, embedded and trailing whitespace accepted. */ int bitmap_parse(const char *start, unsigned int buflen, unsigned long *maskp, int nmaskbits) { const char *end = strnchrnul(start, buflen, '\n') - 1; int chunks = BITS_TO_U32(nmaskbits); u32 *bitmap = (u32 *)maskp; int unset_bit; int chunk; for (chunk = 0; ; chunk++) { end = bitmap_find_region_reverse(start, end); if (start > end) break; if (!chunks--) return -EOVERFLOW; #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) end = bitmap_get_x32_reverse(start, end, &bitmap[chunk ^ 1]); #else end = bitmap_get_x32_reverse(start, end, &bitmap[chunk]); #endif if (IS_ERR(end)) return PTR_ERR(end); } unset_bit = (BITS_TO_U32(nmaskbits) - chunks) * 32; if (unset_bit < nmaskbits) { bitmap_clear(maskp, unset_bit, nmaskbits - unset_bit); return 0; } if (find_next_bit(maskp, unset_bit, nmaskbits) != unset_bit) return -EOVERFLOW; return 0; } EXPORT_SYMBOL(bitmap_parse);
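Tying the bin_attribute discussion above together: a driver exporting a bitmap that may not fit in one page passes sysfs' off/count pair straight through to these helpers. The sketch below is hypothetical driver code (not part of lib/bitmap.c) and assumes a 2048-bit example bitmap.

#include <linux/bitmap.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

static DECLARE_BITMAP(example_mask, 2048);	/* example bitmap, 2048 bits */

/* bin_attribute ->read() callback: off/count come from sysfs as-is. */
static ssize_t example_mask_read(struct file *file, struct kobject *kobj,
				 struct bin_attribute *attr, char *buf,
				 loff_t off, size_t count)
{
	return bitmap_print_bitmask_to_buf(buf, example_mask, 2048, off, count);
}

/*
 * Parsing is symmetric, e.g. when accepting the list format from userspace:
 *
 *	err = bitmap_parselist("0-1023:2/256", example_mask, 2048);
 *	// sets bits 0,1, 256,257, 512,513, 768,769
 */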
/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/cgroup-defs.h - basic definitions for cgroup * * This file provides basic type and interface.
Include this file directly * only if necessary to avoid cyclic dependencies. */ #ifndef _LINUX_CGROUP_DEFS_H #define _LINUX_CGROUP_DEFS_H #include <linux/limits.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/wait.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> #include <linux/u64_stats_sync.h> #include <linux/workqueue.h> #include <linux/bpf-cgroup-defs.h> #include <linux/psi_types.h> #ifdef CONFIG_CGROUPS struct cgroup; struct cgroup_root; struct cgroup_subsys; struct cgroup_taskset; struct kernfs_node; struct kernfs_ops; struct kernfs_open_file; struct seq_file; struct poll_table_struct; #define MAX_CGROUP_TYPE_NAMELEN 32 #define MAX_CGROUP_ROOT_NAMELEN 64 #define MAX_CFTYPE_NAME 64 /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, enum cgroup_subsys_id { #include <linux/cgroup_subsys.h> CGROUP_SUBSYS_COUNT, }; #undef SUBSYS /* bits in struct cgroup_subsys_state flags field */ enum { CSS_NO_REF = (1 << 0), /* no reference counting for this css */ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ CSS_VISIBLE = (1 << 3), /* css is visible to userland */ CSS_DYING = (1 << 4), /* css is dying */ }; /* bits in struct cgroup flags field */ enum { /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* * Clone the parent's configuration when creating a new child * cpuset cgroup. For historical reasons, this option can be * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, /* Control group has to be frozen. */ CGRP_FREEZE, /* Cgroup is frozen. */ CGRP_FROZEN, /* Control group has to be killed. */ CGRP_KILL, }; /* cgroup_root->flags */ enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ /* * Consider namespaces as delegation boundaries. If this flag is * set, controller specific interface files in a namespace root * aren't writeable from inside the namespace. */ CGRP_ROOT_NS_DELEGATE = (1 << 3), /* * Reduce latencies on dynamic cgroup modifications such as task * migrations and controller on/offs by disabling percpu operation on * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from * favordynmod. */ CGRP_ROOT_FAVOR_DYNMODS = (1 << 4), /* * Enable cpuset controller in v1 cgroup to use v2 behavior. */ CGRP_ROOT_CPUSET_V2_MODE = (1 << 16), /* * Enable legacy local memory.events. */ CGRP_ROOT_MEMORY_LOCAL_EVENTS = (1 << 17), /* * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), /* * Enable hugetlb accounting for the memory controller. 
*/ CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ __CFTYPE_ADDED = (1 << 18), }; /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications. This can * be obtained by setting cftype->file_offset. */ struct cgroup_file { /* do not access any fields from outside cgroup core */ struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; }; /* * Per-subsystem/per-cgroup state maintained by the system. This is the * fundamental structural building block that controllers deal with. * * Fields marked with "PI:" are public and immutable and may be accessed * directly without synchronization. */ struct cgroup_subsys_state { /* PI: the cgroup that this css is attached to */ struct cgroup *cgroup; /* PI: the cgroup subsystem that this css is attached to */ struct cgroup_subsys *ss; /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; /* siblings list anchored at the parent's ->children */ struct list_head sibling; struct list_head children; /* flush target list anchored at cgrp->rstat_css_list */ struct list_head rstat_css_node; /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). */ int id; unsigned int flags; /* * Monotonically increasing unique serial number which defines a * uniform order among all csses. It's guaranteed that all * ->children lists are in the ascending order of ->serial_nr and * used to allow interrupting and resuming iterations. */ u64 serial_nr; /* * Incremented by online self and children. Used to guarantee that * parents are not offlined before their children. */ atomic_t online_cnt; /* percpu_ref killing and RCU release */ struct work_struct destroy_work; struct rcu_work destroy_rwork; /* * PI: the parent css. Placed here for cache proximity to following * fields of the containing structure. */ struct cgroup_subsys_state *parent; }; /* * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct * object and speeds up fork()/exit(), since a single inc/dec and a * list_add()/del() can bump the reference count on the entire cgroup * set for a task. */ struct css_set { /* * Set of subsystem states, one for each subsystem. This array is * immutable after creation apart from the init_css_set during * subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* reference count */ refcount_t refcount; /* * For a domain cgroup, the following points to self. If threaded, * to the matching cset of the nearest domain ancestor. The * dom_cset provides access to the domain cgroup and its csses to * which domain level resource consumptions should be charged. 
*/ struct css_set *dom_cset; /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; /* internal task count, protected by css_set_lock */ int nr_tasks; /* * Lists running through all tasks using this cgroup group. * mg_tasks lists tasks which belong to this cset but are in the * process of being migrated out or in. Protected by * css_set_lock, but, during migration, once tasks are moved to * mg_tasks, it can be read safely while holding cgroup_mutex. */ struct list_head tasks; struct list_head mg_tasks; struct list_head dying_tasks; /* all css_task_iters currently walking this cset */ struct list_head task_iters; /* * On the default hierarchy, ->subsys[ssid] may point to a css * attached to an ancestor instead of the cgroup this css_set is * associated with. The following node is anchored at * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to * iterate through all css's attached to a given cgroup. */ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; /* all threaded csets whose ->dom_cset points to this cset */ struct list_head threaded_csets; struct list_head threaded_csets_node; /* * List running through all cgroup groups in the same hash * slot. Protected by css_set_lock */ struct hlist_node hlist; /* * List of cgrp_cset_links pointing at cgroups referenced from this * css_set. Protected by css_set_lock. */ struct list_head cgrp_links; /* * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ struct list_head mg_src_preload_node; struct list_head mg_dst_preload_node; struct list_head mg_node; /* * If this cset is acting as the source of migration the following * two fields are set. mg_src_cgrp and mg_dst_cgrp are * respectively the source and destination cgroups of the on-going * migration. mg_dst_cset is the destination cset the target tasks * on this cset should be migrated to. Protected by cgroup_mutex. */ struct cgroup *mg_src_cgrp; struct cgroup *mg_dst_cgrp; struct css_set *mg_dst_cset; /* dead and being drained, ignore for migration */ bool dead; /* For RCU-protected deletion */ struct rcu_head rcu_head; }; struct cgroup_base_stat { struct task_cputime cputime; #ifdef CONFIG_SCHED_CORE u64 forceidle_sum; #endif }; /* * rstat - cgroup scalable recursive statistics. Accounting is done * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the * hierarchy on reads. * * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are * linked into the updated tree. On the following read, propagation only * considers and consumes the updated tree. This makes reading O(the * number of descendants which have been active since last read) instead of * O(the total number of descendants). * * This is important because there can be a lot of (draining) cgroups which * aren't active and stat may be read frequently. The combination can * become very expensive. By propagating selectively, increasing reading * frequency decreases the cost of each read. * * This struct hosts both the fields which implement the above - * updated_children and updated_next - and the fields which track basic * resource statistics on top of it - bsync, bstat and last_bstat. */ struct cgroup_rstat_cpu { /* * ->bsync protects ->bstat. These are the only fields which get * updated in the hot path. */ struct u64_stats_sync bsync; struct cgroup_base_stat bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the global counters. 
*/ struct cgroup_base_stat last_bstat; /* * This field is used to record the cumulative per-cpu time of * the cgroup and its descendants. Currently it can be read via * eBPF/drgn etc, and we are still trying to determine how to * expose it in the cgroupfs interface. */ struct cgroup_base_stat subtree_bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the per-cpu subtree_bstat. */ struct cgroup_base_stat last_subtree_bstat; /* * Child cgroups with stat updates on this cpu since the last read * are linked on the parent's ->updated_children through * ->updated_next. * * In addition to being more compact, singly-linked list pointing * to the cgroup makes it unnecessary for each per-cpu struct to * point back to the associated cgroup. * * Protected by per-cpu cgroup_rstat_cpu_lock. */ struct cgroup *updated_children; /* terminated by self cgroup */ struct cgroup *updated_next; /* NULL iff not on the list */ }; struct cgroup_freezer_state { /* Should the cgroup and its descendants be frozen. */ bool freeze; /* Should the cgroup actually be frozen? */ int e_freeze; /* Fields below are protected by css_set_lock */ /* Number of frozen descendant cgroups */ int nr_frozen_descendants; /* * Number of tasks, which are counted as frozen: * frozen, SIGSTOPped, and PTRACEd. */ int nr_frozen_tasks; }; struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; unsigned long flags; /* "unsigned long" so bitops work */ /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. This along with * ancestors[] can determine whether a given cgroup is a * descendant of another without traversing the hierarchy. */ int level; /* Maximum allowed descent tree depth */ int max_depth; /* * Keep track of total numbers of visible and dying descent cgroups. * Dying cgroups are cgroups which were deleted by a user, * but are still existing because someone else is holding a reference. * max_descendants is a maximum allowed number of descent cgroups. * * nr_descendants and nr_dying_descendants are protected * by cgroup_mutex and css_set_lock. It's fine to read them holding * any of cgroup_mutex and css_set_lock; for writing both locks * should be held. */ int nr_descendants; int nr_dying_descendants; int max_descendants; /* * Each non-empty css_set associated with this cgroup contributes * one to nr_populated_csets. The counter is zero iff this cgroup * doesn't have any tasks. * * All children which have non-zero nr_populated_csets and/or * nr_populated_children of their own contribute one to either * nr_populated_domain_children or nr_populated_threaded_children * depending on their type. Each counter is zero iff all cgroups * of the type in the subtree proper don't have any tasks. */ int nr_populated_csets; int nr_populated_domain_children; int nr_populated_threaded_children; int nr_threaded_children; /* # of live threaded child cgroups */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ /* handles for "{cpu,memory,io,irq}.pressure" */ struct cgroup_file psi_files[NR_PSI_RESOURCES]; /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through * "cgroup.subtree_control" while ->subtree_ss_mask is the effective * one which may have more subsystems enabled. 
Controller knobs * are made available iff it's enabled in ->subtree_control. */ u16 subtree_control; u16 subtree_ss_mask; u16 old_subtree_control; u16 old_subtree_ss_mask; /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; struct cgroup_root *root; /* * List of cgrp_cset_links pointing at css_sets with tasks in this * cgroup. Protected by css_set_lock. */ struct list_head cset_links; /* * On the default hierarchy, a css_set for a cgroup with some * susbsys disabled will point to css's which are associated with * the closest ancestor which has the subsys enabled. The * following lists all css_sets which point to this cgroup's css * for the given subsystem. */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; /* * If !threaded, self. If threaded, it points to the nearest * domain ancestor. Inside a threaded subtree, cgroups are exempt * from process granularity and no-internal-task constraint. * Domain level resource consumptions which aren't tied to a * specific task are charged to the dom_cgrp. */ struct cgroup *dom_cgrp; struct cgroup *old_dom_cgrp; /* used while enabling threaded */ /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; /* * Add padding to separate the read mostly rstat_cpu and * rstat_css_list into a different cacheline from the following * rstat_flush_next and *bstat fields which can have frequent updates. */ CACHELINE_PADDING(_pad_); /* * A singly-linked list of cgroup structures to be rstat flushed. * This is a scratch field to be used exclusively by * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. */ struct cgroup *rstat_flush_next; /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. */ struct list_head pidlists; struct mutex pidlist_mutex; /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; /* used to schedule release agent */ struct work_struct release_agent_work; /* used to track pressure stalls */ struct psi_group *psi; /* used to store eBPF programs */ struct cgroup_bpf bpf; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif /* All ancestors including self */ struct cgroup *ancestors[]; }; /* * A cgroup_root represents the root of a cgroup hierarchy, and may be * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ struct cgroup_root { struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ unsigned int subsys_mask; /* Unique id for this hierarchy. */ int hierarchy_id; /* A list running through the active hierarchies */ struct list_head root_list; struct rcu_head rcu; /* Must be near the top */ /* * The root cgroup. The containing cgroup_root will be destroyed on its * release. cgrp->ancestors[0] will be used overflowing into the * following field. cgrp_ancestor_storage must immediately follow. 
*/ struct cgroup cgrp; /* must follow cgrp for cgrp->ancestors[0], see above */ struct cgroup *cgrp_ancestor_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; /* Hierarchy-specific flags */ unsigned int flags; /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: * - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata * - the 'cftype' of the file is file->f_path.dentry->d_fsdata */ struct cftype { /* * By convention, the name should begin with the name of the * subsystem, followed by a period. Zero length string indicates * end of cftype array. */ char name[MAX_CFTYPE_NAME]; unsigned long private; /* * The maximum length of string, excluding trailing nul, that can * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. */ size_t max_write_len; /* CFTYPE_* flags */ unsigned int flags; /* * If non-zero, should contain the offset from the start of css to * a struct cgroup_file field. cgroup will record the handle of * the created file into it. The recorded handle can be used as * long as the containing css remains accessible. */ unsigned int file_offset; /* * Fields used for internal bookkeeping. Initialized automatically * during registration. */ struct cgroup_subsys *ss; /* NULL for cgroup core files */ struct list_head node; /* anchored at ss->cfts */ struct kernfs_ops *kf_ops; int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_s64() is a signed version of read_u64() */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* generic seq_file read interface */ int (*seq_show)(struct seq_file *sf, void *v); /* optional ops, implement all or none */ void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); /* * write_u64() is a shortcut for the common case of accepting * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. */ int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, u64 val); /* * write_s64() is a signed version of write_u64() */ int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, s64 val); /* * write() is the generic write callback which maps directly to * kernfs write operation and overrides all other operations. * Maximum write size is determined by ->max_write_len. Use * of_css/cft() to access the associated css and cft. */ ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif }; /* * Control Group subsystem type. 
* See Documentation/admin-guide/cgroup-v1/cgroups.rst for details */ struct cgroup_subsys { struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); int (*css_online)(struct cgroup_subsys_state *css); void (*css_offline)(struct cgroup_subsys_state *css); void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*css_local_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); bool early_init:1; /* * If %true, the controller, on the default hierarchy, doesn't show * up in "cgroup.controllers" or "cgroup.subtree_control", is * implicitly enabled on all cgroups on the default hierarchy, and * bypasses the "no internal process" constraint. This is for * utility type controllers which is transparent to userland. * * An implicit controller can be stolen from the default hierarchy * anytime and thus must be okay with offline csses from previous * hierarchies coexisting with csses for the current one. */ bool implicit_on_dfl:1; /* * If %true, the controller, supports threaded mode on the default * hierarchy. In a threaded subtree, both process granularity and * no-internal-process constraint are ignored and a threaded * controllers should be able to handle that. * * Note that as an implicit controller is automatically enabled on * all cgroups on the default hierarchy, it should also be * threaded. implicit && !threaded is not supported. */ bool threaded:1; /* the following two fields are initialized automatically during boot */ int id; const char *name; /* optional, initialized automatically during boot if not set */ const char *legacy_name; /* link to parent, protected by cgroup_lock() */ struct cgroup_root *root; /* idr for css->id */ struct idr css_idr; /* * List of cftypes. Each entry is the first entry of an array * terminated by zero length name. */ struct list_head cfts; /* * Base cftypes which are automatically registered. The two can * point to the same array. */ struct cftype *dfl_cftypes; /* for the default hierarchy */ struct cftype *legacy_cftypes; /* for the legacy hierarchies */ /* * A subsystem may depend on other subsystems. When such subsystem * is enabled on a cgroup, the depended-upon subsystems are enabled * together if available. Subsystems enabled due to dependency are * not visible to userland until explicitly enabled. The following * specifies the mask of subsystems that this one depends on. */ unsigned int depends_on; }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; /** * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes * using a percpu_rw_semaphore. 
*/ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); } /** * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups * @tsk: target task * * Counterpart of cgroup_threadcgroup_change_begin(). */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { percpu_up_read(&cgroup_threadgroup_rwsem); } #else /* CONFIG_CGROUPS */ #define CGROUP_SUBSYS_COUNT 0 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { might_sleep(); } static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} #endif /* CONFIG_CGROUPS */ #ifdef CONFIG_SOCK_CGROUP_DATA /* * sock_cgroup_data is embedded at sock->sk_cgrp_data and contains * per-socket cgroup information except for memcg association. * * On legacy hierarchies, net_prio and net_cls controllers directly * set attributes on each sock which can then be tested by the network * layer. On the default hierarchy, each sock is associated with the * cgroup it was created in and the networking layer can match the * cgroup directly. */ struct sock_cgroup_data { struct cgroup *cgroup; /* v2 */ #ifdef CONFIG_CGROUP_NET_CLASSID u32 classid; /* v1 */ #endif #ifdef CONFIG_CGROUP_NET_PRIO u16 prioidx; /* v1 */ #endif }; static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_PRIO return READ_ONCE(skcd->prioidx); #else return 1; #endif } static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd) { #ifdef CONFIG_CGROUP_NET_CLASSID return READ_ONCE(skcd->classid); #else return 0; #endif } static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd, u16 prioidx) { #ifdef CONFIG_CGROUP_NET_PRIO WRITE_ONCE(skcd->prioidx, prioidx); #endif } static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd, u32 classid) { #ifdef CONFIG_CGROUP_NET_CLASSID WRITE_ONCE(skcd->classid, classid); #endif } #else /* CONFIG_SOCK_CGROUP_DATA */ struct sock_cgroup_data { }; #endif /* CONFIG_SOCK_CGROUP_DATA */ #endif /* _LINUX_CGROUP_DEFS_H */
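To see how the pieces above fit together, here is a hedged sketch of a minimal, purely hypothetical controller: it wraps cgroup_subsys_state in its own per-cgroup state, implements css_alloc/css_free, and publishes one read-only file through a zero-terminated cftype array. A real controller additionally needs an entry in linux/cgroup_subsys.h, which is what generates the <name>_cgrp_subsys identifier.

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

/* Hypothetical per-cgroup state for this illustration. */
struct demo_css {
	struct cgroup_subsys_state css;
	u64 counter;
};

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct demo_css *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

	return dc ? &dc->css : ERR_PTR(-ENOMEM);
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
	kfree(container_of(css, struct demo_css, css));
}

static u64 demo_counter_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	return container_of(css, struct demo_css, css)->counter;
}

static struct cftype demo_files[] = {
	{
		.name = "counter",		/* shown as demo.counter */
		.read_u64 = demo_counter_read,
	},
	{ }	/* zero-length name terminates the array */
};

struct cgroup_subsys demo_cgrp_subsys = {
	.css_alloc	= demo_css_alloc,
	.css_free	= demo_css_free,
	.dfl_cftypes	= demo_files,
	.legacy_cftypes	= demo_files,
};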
/* * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> * * For licencing details see kernel-base/COPYING */ #include <linux/init.h> #include <linux/ioport.h> #include <linux/export.h> #include <linux/pci.h> #include <asm/acpi.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> #include <asm/e820/api.h> #include <asm/time.h> #include <asm/irq.h> #include <asm/io_apic.h> #include <asm/hpet.h> #include <asm/memtype.h> #include <asm/tsc.h> #include <asm/iommu.h> #include <asm/mach_traps.h> #include <asm/irqdomain.h> #include <asm/realmode.h> void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } void get_rtc_noop(struct timespec64 *now) { } static __initconst const struct of_device_id of_cmos_match[] = { { .compatible = "motorola,mc146818" }, {} }; /* * Allow devicetree configured systems to disable the RTC by setting the * corresponding DT node's status property to disabled. Code is optimized * out for CONFIG_OF=n builds. */ static __init void x86_wallclock_init(void) { struct device_node *node = of_find_matching_node(NULL, of_cmos_match); if (node && !of_device_is_available(node)) { x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; } } /* * The platform setup functions are preset with the default functions * for standard PC hardware.
*/ struct x86_init_ops x86_init __initdata = { .resources = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = e820__memory_setup_default, }, .mpparse = { .setup_ioapic_ids = x86_init_noop, .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, }, .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init, .create_pci_msi_domain = native_create_pci_msi_domain, }, .oem = { .arch_setup = x86_init_noop, .banner = default_banner, }, .paging = { .pagetable_init = native_pagetable_init, }, .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, .timer_init = hpet_time_init, .wallclock_init = x86_wallclock_init, }, .iommu = { .iommu_init = iommu_init_noop, }, .pci = { .init = x86_default_pci_init, .init_irq = x86_default_pci_init_irq, .fixup_irqs = x86_default_pci_fixup_irqs, }, .hyper = { .init_platform = x86_init_noop, .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, .msi_ext_dest_id = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, .init_after_bootmem = x86_init_noop, }, .acpi = { .set_root_pointer = x86_default_set_root_pointer, .get_root_pointer = x86_default_get_root_pointer, .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, }; struct x86_cpuinit_ops x86_cpuinit = { .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, .parallel_bringup = true, }; static void default_nmi_init(void) { }; static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } static bool is_private_mmio_noop(u64 addr) {return false; } struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_cmos_time, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, .save_sched_clock_state = tsc_save_sched_clock_state, .restore_sched_clock_state = tsc_restore_sched_clock_state, .realmode_reserve = reserve_real_mode, .realmode_init = init_real_mode, .hyper.pin_vcpu = x86_op_int_noop, .hyper.is_private_mmio = is_private_mmio_noop, .guest = { .enc_status_change_prepare = enc_status_change_prepare_noop, .enc_status_change_finish = enc_status_change_finish_noop, .enc_tlb_flush_required = enc_tlb_flush_required_noop, .enc_cache_flush_required = enc_cache_flush_required_noop, }, }; EXPORT_SYMBOL_GPL(x86_platform); struct x86_apic_ops x86_apic_ops __ro_after_init = { .io_apic_read = native_io_apic_read, .restore = native_restore_boot_irq_mode, };
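/*
 * Editor's note: hedged sketch of how the default tables above are meant to
 * be used. Guest/platform code overrides individual callbacks before the
 * corresponding hook runs; the my_guest_* functions below are hypothetical
 * and only illustrate the pattern (compare the real Hyper-V/VMware guest
 * setup code), with my_guest_setup() wired up by early hypervisor detection
 * in a real port (not shown).
 */
static void __init my_guest_init_platform(void)
{
	/* e.g. reserve hypervisor-specific resources, register clocksources */
}

static bool __init my_guest_x2apic_available(void)
{
	return true;	/* assumption: the hypothetical hypervisor always exposes x2APIC */
}

static unsigned long my_guest_calibrate_tsc(void)
{
	return 2496000;	/* assumption: TSC frequency in kHz advertised by the hypervisor */
}

static void __init my_guest_setup(void)
{
	x86_init.hyper.init_platform = my_guest_init_platform;
	x86_init.hyper.x2apic_available = my_guest_x2apic_available;
	x86_platform.calibrate_tsc = my_guest_calibrate_tsc;
}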
// SPDX-License-Identifier: GPL-2.0-only /* * vivid-kthread-out.c - video/vbi output thread support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/font.h> #include <linux/mutex.h> #include <linux/videodev2.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/random.h> #include <linux/v4l2-dv-timings.h> #include <linux/jiffies.h> #include <asm/div64.h> #include <media/videobuf2-vmalloc.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-fh.h> #include <media/v4l2-event.h> #include "vivid-core.h" #include "vivid-vid-common.h" #include "vivid-vid-cap.h" #include "vivid-vid-out.h" #include "vivid-radio-common.h" #include "vivid-radio-rx.h" #include "vivid-radio-tx.h" #include "vivid-sdr-cap.h" #include "vivid-vbi-cap.h" #include "vivid-vbi-out.h" #include "vivid-osd.h" #include "vivid-ctrls.h" #include "vivid-kthread-out.h" #include "vivid-meta-out.h" static void vivid_thread_vid_out_tick(struct vivid_dev *dev) { struct vivid_buffer *vid_out_buf = NULL; struct vivid_buffer *vbi_out_buf = NULL; struct vivid_buffer *meta_out_buf = NULL; dprintk(dev, 1, "Video Output Thread Tick\n"); /* Drop a certain percentage of buffers. */ if (dev->perc_dropped_buffers && get_random_u32_below(100) < dev->perc_dropped_buffers) return; spin_lock(&dev->slock); /* * Only dequeue buffer if there is at least one more pending. * This makes video loopback possible.
*/ if (!list_empty(&dev->vid_out_active) && !list_is_singular(&dev->vid_out_active)) { vid_out_buf = list_entry(dev->vid_out_active.next, struct vivid_buffer, list); list_del(&vid_out_buf->list); } if (!list_empty(&dev->vbi_out_active) && (dev->field_out != V4L2_FIELD_ALTERNATE || (dev->vbi_out_seq_count & 1))) { vbi_out_buf = list_entry(dev->vbi_out_active.next, struct vivid_buffer, list); list_del(&vbi_out_buf->list); } if (!list_empty(&dev->meta_out_active)) { meta_out_buf = list_entry(dev->meta_out_active.next, struct vivid_buffer, list); list_del(&meta_out_buf->list); } spin_unlock(&dev->slock); if (!vid_out_buf && !vbi_out_buf && !meta_out_buf) return; if (vid_out_buf) { v4l2_ctrl_request_setup(vid_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); v4l2_ctrl_request_complete(vid_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); vid_out_buf->vb.sequence = dev->vid_out_seq_count; if (dev->field_out == V4L2_FIELD_ALTERNATE) { /* * The sequence counter counts frames, not fields. * So divide by two. */ vid_out_buf->vb.sequence /= 2; } vid_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&vid_out_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vid_out buffer %d done\n", vid_out_buf->vb.vb2_buf.index); } if (vbi_out_buf) { v4l2_ctrl_request_setup(vbi_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); v4l2_ctrl_request_complete(vbi_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); if (dev->stream_sliced_vbi_out) vivid_sliced_vbi_out_process(dev, vbi_out_buf); vbi_out_buf->vb.sequence = dev->vbi_out_seq_count; vbi_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&vbi_out_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vbi_out buffer %d done\n", vbi_out_buf->vb.vb2_buf.index); } if (meta_out_buf) { v4l2_ctrl_request_setup(meta_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); v4l2_ctrl_request_complete(meta_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); vivid_meta_out_process(dev, meta_out_buf); meta_out_buf->vb.sequence = dev->meta_out_seq_count; meta_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&meta_out_buf->vb.vb2_buf, dev->dqbuf_error ? 
VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "meta_out buffer %d done\n", meta_out_buf->vb.vb2_buf.index); } dev->dqbuf_error = false; } static int vivid_thread_vid_out(void *data) { struct vivid_dev *dev = data; u64 numerators_since_start; u64 buffers_since_start; u64 next_jiffies_since_start; unsigned long jiffies_since_start; unsigned long cur_jiffies; unsigned wait_jiffies; unsigned numerator; unsigned denominator; dprintk(dev, 1, "Video Output Thread Start\n"); set_freezable(); /* Resets frame counters */ dev->out_seq_offset = 0; dev->out_seq_count = 0; dev->jiffies_vid_out = jiffies; dev->out_seq_resync = false; if (dev->time_wrap) dev->time_wrap_offset = dev->time_wrap - ktime_get_ns(); else dev->time_wrap_offset = 0; for (;;) { try_to_freeze(); if (kthread_should_stop()) break; if (!mutex_trylock(&dev->mutex)) { schedule(); continue; } cur_jiffies = jiffies; if (dev->out_seq_resync) { dev->jiffies_vid_out = cur_jiffies; dev->out_seq_offset = dev->out_seq_count + 1; dev->out_seq_count = 0; dev->out_seq_resync = false; } numerator = dev->timeperframe_vid_out.numerator; denominator = dev->timeperframe_vid_out.denominator; if (dev->field_out == V4L2_FIELD_ALTERNATE) denominator *= 2; /* Calculate the number of jiffies since we started streaming */ jiffies_since_start = cur_jiffies - dev->jiffies_vid_out; /* Get the number of buffers streamed since the start */ buffers_since_start = (u64)jiffies_since_start * denominator + (HZ * numerator) / 2; do_div(buffers_since_start, HZ * numerator); /* * After more than 0xf0000000 (rounded down to a multiple of * 'jiffies-per-day' to ease jiffies_to_msecs calculation) * jiffies have passed since we started streaming reset the * counters and keep track of the sequence offset. */ if (jiffies_since_start > JIFFIES_RESYNC) { dev->jiffies_vid_out = cur_jiffies; dev->out_seq_offset = buffers_since_start; buffers_since_start = 0; } dev->out_seq_count = buffers_since_start + dev->out_seq_offset; dev->vid_out_seq_count = dev->out_seq_count - dev->vid_out_seq_start; dev->vbi_out_seq_count = dev->out_seq_count - dev->vbi_out_seq_start; dev->meta_out_seq_count = dev->out_seq_count - dev->meta_out_seq_start; vivid_thread_vid_out_tick(dev); mutex_unlock(&dev->mutex); /* * Calculate the number of 'numerators' streamed since we started, * not including the current buffer. */ numerators_since_start = buffers_since_start * numerator; /* And the number of jiffies since we started */ jiffies_since_start = jiffies - dev->jiffies_vid_out; /* Increase by the 'numerator' of one buffer */ numerators_since_start += numerator; /* * Calculate when that next buffer is supposed to start * in jiffies since we started streaming. 
*/ next_jiffies_since_start = numerators_since_start * HZ + denominator / 2; do_div(next_jiffies_since_start, denominator); /* If it is in the past, then just schedule asap */ if (next_jiffies_since_start < jiffies_since_start) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && !kthread_should_stop()) schedule(); } dprintk(dev, 1, "Video Output Thread End\n"); return 0; } static void vivid_grab_controls(struct vivid_dev *dev, bool grab) { v4l2_ctrl_grab(dev->ctrl_has_crop_out, grab); v4l2_ctrl_grab(dev->ctrl_has_compose_out, grab); v4l2_ctrl_grab(dev->ctrl_has_scaler_out, grab); v4l2_ctrl_grab(dev->ctrl_tx_mode, grab); v4l2_ctrl_grab(dev->ctrl_tx_rgb_range, grab); } int vivid_start_generating_vid_out(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_out) { u32 seq_count = dev->out_seq_count + dev->seq_wrap * 128; if (pstreaming == &dev->vid_out_streaming) dev->vid_out_seq_start = seq_count; else if (pstreaming == &dev->vbi_out_streaming) dev->vbi_out_seq_start = seq_count; else dev->meta_out_seq_start = seq_count; *pstreaming = true; return 0; } /* Resets frame counters */ dev->jiffies_vid_out = jiffies; dev->vid_out_seq_start = dev->seq_wrap * 128; dev->vbi_out_seq_start = dev->seq_wrap * 128; dev->meta_out_seq_start = dev->seq_wrap * 128; dev->kthread_vid_out = kthread_run(vivid_thread_vid_out, dev, "%s-vid-out", dev->v4l2_dev.name); if (IS_ERR(dev->kthread_vid_out)) { int err = PTR_ERR(dev->kthread_vid_out); dev->kthread_vid_out = NULL; v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n"); return err; } *pstreaming = true; vivid_grab_controls(dev, true); dprintk(dev, 1, "returning from %s\n", __func__); return 0; } void vivid_stop_generating_vid_out(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_out == NULL) return; *pstreaming = false; if (pstreaming == &dev->vid_out_streaming) { /* Release all active buffers */ while (!list_empty(&dev->vid_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vid_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vid_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->vbi_out_streaming) { while (!list_empty(&dev->vbi_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vbi_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vbi_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->meta_out_streaming) { while (!list_empty(&dev->meta_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->meta_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "meta_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (dev->vid_out_streaming || dev->vbi_out_streaming || dev->meta_out_streaming) return; /* shutdown control thread */ vivid_grab_controls(dev, false); kthread_stop(dev->kthread_vid_out); dev->kthread_vid_out = NULL; }
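/*
 * Editor's note: hedged sketch of the pacing arithmetic used by
 * vivid_thread_vid_out() above, pulled out into a hypothetical helper so the
 * rounding is easier to follow. It mirrors the in-kernel calculation
 * (multiply first, add half the divisor, then do_div() for a
 * round-to-nearest result) and assumes the same meaning of HZ and of the
 * numerator/denominator timeperframe fields.
 */
static u64 example_buffers_since_start(unsigned long jiffies_since_start,
				       unsigned numerator, unsigned denominator)
{
	/* buffers = jiffies / HZ / (numerator / denominator), rounded to nearest */
	u64 buffers = (u64)jiffies_since_start * denominator + (HZ * numerator) / 2;

	do_div(buffers, HZ * numerator);
	return buffers;
}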
/* CMTP implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2002-2003 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED.
*/ #include <linux/export.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/ioctl.h> #include <linux/file.h> #include <linux/compat.h> #include <linux/gfp.h> #include <linux/uaccess.h> #include <net/sock.h> #include <linux/isdn/capilli.h> #include "cmtp.h" static struct bt_sock_list cmtp_sk_list = { .lock = __RW_LOCK_UNLOCKED(cmtp_sk_list.lock) }; static int cmtp_sock_release(struct socket *sock) { struct sock *sk = sock->sk; BT_DBG("sock %p sk %p", sock, sk); if (!sk) return 0; bt_sock_unlink(&cmtp_sk_list, sk); sock_orphan(sk); sock_put(sk); return 0; } static int do_cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct cmtp_connadd_req ca; struct cmtp_conndel_req cd; struct cmtp_connlist_req cl; struct cmtp_conninfo ci; struct socket *nsock; int err; BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case CMTPCONNADD: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; nsock = sockfd_lookup(ca.sock, &err); if (!nsock) return err; if (nsock->sk->sk_state != BT_CONNECTED) { sockfd_put(nsock); return -EBADFD; } err = cmtp_add_connection(&ca, nsock); if (!err) { if (copy_to_user(argp, &ca, sizeof(ca))) err = -EFAULT; } else sockfd_put(nsock); return err; case CMTPCONNDEL: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; return cmtp_del_connection(&cd); case CMTPGETCONNLIST: if (copy_from_user(&cl, argp, sizeof(cl))) return -EFAULT; if (cl.cnum <= 0) return -EINVAL; err = cmtp_get_connlist(&cl); if (!err && copy_to_user(argp, &cl, sizeof(cl))) return -EFAULT; return err; case CMTPGETCONNINFO: if (copy_from_user(&ci, argp, sizeof(ci))) return -EFAULT; err = cmtp_get_conninfo(&ci); if (!err && copy_to_user(argp, &ci, sizeof(ci))) return -EFAULT; return err; } return -EINVAL; } static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return do_cmtp_sock_ioctl(sock, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); if (cmd == CMTPGETCONNLIST) { struct cmtp_connlist_req cl; u32 __user *p = argp; u32 uci; int err; if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); if (cl.cnum <= 0) return -EINVAL; err = cmtp_get_connlist(&cl); if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } return do_cmtp_sock_ioctl(sock, cmd, argp); } #endif static const struct proto_ops cmtp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = cmtp_sock_release, .ioctl = cmtp_sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = cmtp_sock_compat_ioctl, #endif .bind = sock_no_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static struct proto cmtp_proto = { .name = "CMTP", .owner = THIS_MODULE, .obj_size = sizeof(struct bt_sock) }; static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern); if (!sk) return 
-ENOMEM; sock_init_data(sock, sk); sock->ops = &cmtp_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = BT_OPEN; bt_sock_link(&cmtp_sk_list, sk); return 0; } static const struct net_proto_family cmtp_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = cmtp_sock_create }; int cmtp_init_sockets(void) { int err; err = proto_register(&cmtp_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_CMTP, &cmtp_sock_family_ops); if (err < 0) { BT_ERR("Can't register CMTP socket"); goto error; } err = bt_procfs_init(&init_net, "cmtp", &cmtp_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create CMTP proc file"); bt_sock_unregister(BTPROTO_CMTP); goto error; } BT_INFO("CMTP socket layer initialized"); return 0; error: proto_unregister(&cmtp_proto); return err; } void cmtp_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "cmtp"); bt_sock_unregister(BTPROTO_CMTP); proto_unregister(&cmtp_proto); }
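/*
 * Editor's note: hedged user-space sketch of the ioctl interface implemented
 * above (roughly what BlueZ's ciptool does). It assumes the BlueZ
 * <bluetooth/bluetooth.h> and <bluetooth/cmtp.h> headers, whose
 * struct cmtp_connlist_req / struct cmtp_conninfo definitions and CMTP*
 * ioctl numbers mirror the driver's cmtp.h; the field names used here
 * (cnum, ci, num, state) are therefore assumptions about that header.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/cmtp.h>

static int example_list_cmtp_connections(void)
{
	struct cmtp_conninfo ci[8];
	struct cmtp_connlist_req cl = { .cnum = 8, .ci = ci };
	int i, ctl = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_CMTP);

	if (ctl < 0)
		return -1;
	/* CMTPGETCONNLIST needs no CAP_NET_ADMIN, unlike CMTPCONNADD/CMTPCONNDEL */
	if (ioctl(ctl, CMTPGETCONNLIST, &cl) < 0) {
		close(ctl);
		return -1;
	}
	for (i = 0; i < (int)cl.cnum; i++)
		printf("CMTP connection %d, state %d\n", ci[i].num, ci[i].state);
	close(ctl);
	return 0;
}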
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "tctx.h" static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) { struct io_wq_hash *hash; struct io_wq_data data; unsigned int concurrency; mutex_lock(&ctx->uring_lock); hash = ctx->hash_map; if (!hash) { hash = kzalloc(sizeof(*hash), GFP_KERNEL); if (!hash) { mutex_unlock(&ctx->uring_lock); return ERR_PTR(-ENOMEM); } refcount_set(&hash->refs, 1); init_waitqueue_head(&hash->wait); ctx->hash_map = hash; } mutex_unlock(&ctx->uring_lock); data.hash = hash; data.task = task; data.free_work = io_wq_free_work; data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); return io_wq_create(concurrency, &data); } void __io_uring_free(struct task_struct *tsk) { struct io_uring_task *tctx = tsk->io_uring; WARN_ON_ONCE(!xa_empty(&tctx->xa)); WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; } __cold int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); if (unlikely(!tctx)) return -ENOMEM; ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { kfree(tctx); return ret; } tctx->io_wq = io_init_wq_offload(ctx, task); if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); kfree(tctx); return ret; } xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; int ret; if (unlikely(!tctx)) { ret =
io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; tctx = current->io_uring; if (ctx->iowq_limits_set) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; ret = io_wq_max_workers(tctx->io_wq, limits); if (ret) return ret; } } if (!xa_load(&tctx->xa, (unsigned long)ctx)) { node = kmalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->ctx = ctx; node->task = current; ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, node, GFP_KERNEL)); if (ret) { kfree(node); return ret; } mutex_lock(&ctx->uring_lock); list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } return 0; } int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx) { int ret; if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && ctx->submitter_task != current) return -EEXIST; ret = __io_uring_add_tctx_node(ctx); if (ret) return ret; current->io_uring->last = ctx; return 0; } /* * Remove this io_uring_file -> task mapping. */ __cold void io_uring_del_tctx_node(unsigned long index) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; if (!tctx) return; node = xa_erase(&tctx->xa, index); if (!node) return; WARN_ON_ONCE(current != node->task); WARN_ON_ONCE(list_empty(&node->ctx_node)); mutex_lock(&node->ctx->uring_lock); list_del(&node->ctx_node); mutex_unlock(&node->ctx->uring_lock); if (tctx->last == node->ctx) tctx->last = NULL; kfree(node); } __cold void io_uring_clean_tctx(struct io_uring_task *tctx) { struct io_wq *wq = tctx->io_wq; struct io_tctx_node *node; unsigned long index; xa_for_each(&tctx->xa, index, node) { io_uring_del_tctx_node(index); cond_resched(); } if (wq) { /* * Must be after io_uring_del_tctx_node() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). */ io_wq_put_and_exit(wq); tctx->io_wq = NULL; } } void io_uring_unreg_ringfd(void) { struct io_uring_task *tctx = current->io_uring; int i; for (i = 0; i < IO_RINGFD_REG_MAX; i++) { if (tctx->registered_rings[i]) { fput(tctx->registered_rings[i]); tctx->registered_rings[i] = NULL; } } } int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { int offset; for (offset = start; offset < end; offset++) { offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[offset]) continue; tctx->registered_rings[offset] = file; return offset; } return -EBUSY; } static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, int start, int end) { struct file *file; int offset; file = fget(fd); if (!file) { return -EBADF; } else if (!io_is_uring_fops(file)) { fput(file); return -EOPNOTSUPP; } offset = io_ring_add_registered_file(tctx, file, start, end); if (offset < 0) fput(file); return offset; } /* * Register a ring fd to avoid fdget/fdput for each io_uring_enter() * invocation. User passes in an array of struct io_uring_rsrc_update * with ->data set to the ring_fd, and ->offset given for the desired * index. If no index is desired, application may set ->offset == -1U * and we'll find an available index. Returns number of entries * successfully processed, or < 0 on error if none were processed. 
*/ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args) { struct io_uring_rsrc_update __user *arg = __arg; struct io_uring_rsrc_update reg; struct io_uring_task *tctx; int ret, i; if (!nr_args || nr_args > IO_RINGFD_REG_MAX) return -EINVAL; mutex_unlock(&ctx->uring_lock); ret = __io_uring_add_tctx_node(ctx); mutex_lock(&ctx->uring_lock); if (ret) return ret; tctx = current->io_uring; for (i = 0; i < nr_args; i++) { int start, end; if (copy_from_user(&reg, &arg[i], sizeof(reg))) { ret = -EFAULT; break; } if (reg.resv) { ret = -EINVAL; break; } if (reg.offset == -1U) { start = 0; end = IO_RINGFD_REG_MAX; } else { if (reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } start = reg.offset; end = start + 1; } ret = io_ring_add_registered_fd(tctx, reg.data, start, end); if (ret < 0) break; reg.offset = ret; if (copy_to_user(&arg[i], &reg, sizeof(reg))) { fput(tctx->registered_rings[reg.offset]); tctx->registered_rings[reg.offset] = NULL; ret = -EFAULT; break; } } return i ? i : ret; } int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args) { struct io_uring_rsrc_update __user *arg = __arg; struct io_uring_task *tctx = current->io_uring; struct io_uring_rsrc_update reg; int ret = 0, i; if (!nr_args || nr_args > IO_RINGFD_REG_MAX) return -EINVAL; if (!tctx) return 0; for (i = 0; i < nr_args; i++) { if (copy_from_user(&reg, &arg[i], sizeof(reg))) { ret = -EFAULT; break; } if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { ret = -EINVAL; break; } reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[reg.offset]) { fput(tctx->registered_rings[reg.offset]); tctx->registered_rings[reg.offset] = NULL; } } return i ? i : ret; }
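/*
 * Editor's note: hedged user-space sketch of the ABI that io_ringfd_register()
 * implements: an io_uring_register(IORING_REGISTER_RING_FDS) call with one
 * io_uring_rsrc_update entry per ring fd, ->data holding the fd and ->offset
 * either a fixed slot or -1U for "any free slot". Raw syscall numbers are
 * used only to keep the sketch self-contained; liburing's
 * io_uring_register_ring_fd() wraps the same operation.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int example_register_ring_fd(int ring_fd)
{
	struct io_uring_rsrc_update upd = {
		.offset = -1U,			/* let the kernel pick a free slot */
		.data	= (unsigned long)ring_fd,
	};
	long ret = syscall(__NR_io_uring_register, ring_fd,
			   IORING_REGISTER_RING_FDS, &upd, 1);

	/* one entry processed: ->offset now holds the registered index, which
	 * can later be passed back via the IORING_ENTER_REGISTERED_RING flag */
	return ret == 1 ? (int)upd.offset : -1;
}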
/* * Compatibility functions which bloat the callers too much to make inline. * All of the callers of these functions should be converted to use folios * eventually. */ #include <linux/migrate.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include "internal.h" struct address_space *page_mapping(struct page *page) { return folio_mapping(page_folio(page)); } EXPORT_SYMBOL(page_mapping); void unlock_page(struct page *page) { return folio_unlock(page_folio(page)); } EXPORT_SYMBOL(unlock_page); void end_page_writeback(struct page *page) { return folio_end_writeback(page_folio(page)); } EXPORT_SYMBOL(end_page_writeback); void wait_on_page_writeback(struct page *page) { return folio_wait_writeback(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_on_page_writeback); void wait_for_stable_page(struct page *page) { return folio_wait_stable(page_folio(page)); } EXPORT_SYMBOL_GPL(wait_for_stable_page); void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); } EXPORT_SYMBOL(mark_page_accessed); void set_page_writeback(struct page *page) { folio_start_writeback(page_folio(page)); } EXPORT_SYMBOL(set_page_writeback); bool set_page_dirty(struct page *page) { return folio_mark_dirty(page_folio(page)); } EXPORT_SYMBOL(set_page_dirty); int __set_page_dirty_nobuffers(struct page *page) { return filemap_dirty_folio(page_mapping(page), page_folio(page)); } EXPORT_SYMBOL(__set_page_dirty_nobuffers); bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); } EXPORT_SYMBOL(clear_page_dirty_for_io); bool redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) { return folio_redirty_for_writepage(wbc, page_folio(page)); } EXPORT_SYMBOL(redirty_page_for_writepage); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { return filemap_add_folio(mapping, page_folio(page), index, gfp); } EXPORT_SYMBOL(add_to_page_cache_lru); noinline struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); if (IS_ERR(folio)) return NULL; return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); bool isolate_lru_page(struct page *page) { if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) return false; return folio_isolate_lru((struct folio *)page); } void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); }
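/*
 * Editor's note: illustrative sketch of the conversion these compatibility
 * wrappers exist to support. The example_* helpers are hypothetical
 * filesystem code, not part of mm/folio-compat.c; they show the same
 * operation written against the legacy page API (which bounces through the
 * wrappers above) and directly against the folio API.
 */
static void example_legacy_dirty(struct page *page)
{
	set_page_dirty(page);		/* wrapper: folio_mark_dirty(page_folio(page)) */
	mark_page_accessed(page);	/* wrapper: folio_mark_accessed(page_folio(page)) */
}

static void example_folio_dirty(struct folio *folio)
{
	folio_mark_dirty(folio);	/* direct folio calls, no page_folio() hop */
	folio_mark_accessed(folio);
}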
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include "minix.h" static int add_nondir(struct dentry *dentry, struct inode *inode) { int err = minix_add_link(dentry, inode); if (!err) { d_instantiate(dentry, inode); return 0; } inode_dec_link_count(inode); iput(inode); return err; } static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; if (dentry->d_name.len > minix_sb(dir->i_sb)->s_namelen) return ERR_PTR(-ENAMETOOLONG); ino = minix_inode_by_name(dentry); if (ino) inode = minix_iget(dir->i_sb, ino); return d_splice_alias(inode, dentry); } static int minix_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; if (!old_valid_dev(rdev)) return -EINVAL; inode = minix_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); minix_set_inode(inode, rdev); mark_inode_dirty(inode); return add_nondir(dentry, inode); } static int minix_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode = minix_new_inode(dir, mode); if (IS_ERR(inode)) return finish_open_simple(file, PTR_ERR(inode)); minix_set_inode(inode, 0); mark_inode_dirty(inode); d_tmpfile(file, inode); return finish_open_simple(file, 0); } static int minix_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return minix_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } static int minix_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int i = strlen(symname)+1; struct inode * inode; int err; if (i > dir->i_sb->s_blocksize) return -ENAMETOOLONG; inode = minix_new_inode(dir, S_IFLNK | 0777); if (IS_ERR(inode)) return PTR_ERR(inode); minix_set_inode(inode, 0); err = page_symlink(inode, symname, i); if (unlikely(err)) { inode_dec_link_count(inode); iput(inode); return err; } return add_nondir(dentry, inode); } static int minix_link(struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); return add_nondir(dentry, inode); } static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err; inode = minix_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode);
inode_inc_link_count(dir); minix_set_inode(inode, 0); inode_inc_link_count(inode); err = minix_make_empty(inode, dir); if (err) goto out_fail; err = minix_add_link(dentry, inode); if (err) goto out_fail; d_instantiate(dentry, inode); out: return err; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); iput(inode); inode_dec_link_count(dir); goto out; } static int minix_unlink(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); struct page * page; struct minix_dir_entry * de; int err; de = minix_find_entry(dentry, &page); if (!de) return -ENOENT; err = minix_delete_entry(de, page); unmap_and_put_page(page, de); if (err) return err; inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); return 0; } static int minix_rmdir(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (minix_empty_dir(inode)) { err = minix_unlink(dir, dentry); if (!err) { inode_dec_link_count(dir); inode_dec_link_count(inode); } } return err; } static int minix_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct minix_dir_entry * dir_de = NULL; struct page * old_page; struct minix_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_de = minix_find_entry(old_dentry, &old_page); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; dir_de = minix_dotdot(old_inode, &dir_page); if (!dir_de) goto out_old; } if (new_inode) { struct page * new_page; struct minix_dir_entry * new_de; err = -ENOTEMPTY; if (dir_de && !minix_empty_dir(new_inode)) goto out_dir; err = -ENOENT; new_de = minix_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; err = minix_set_link(new_de, new_page, old_inode); kunmap(new_page); put_page(new_page); if (err) goto out_dir; inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); } else { err = minix_add_link(new_dentry, old_inode); if (err) goto out_dir; if (dir_de) inode_inc_link_count(new_dir); } err = minix_delete_entry(old_de, old_page); if (err) goto out_dir; mark_inode_dirty(old_inode); if (dir_de) { err = minix_set_link(dir_de, dir_page, new_dir); if (!err) inode_dec_link_count(old_dir); } out_dir: if (dir_de) unmap_and_put_page(dir_page, dir_de); out_old: unmap_and_put_page(old_page, old_de); out: return err; } /* * directories can handle most operations... */ const struct inode_operations minix_dir_inode_operations = { .create = minix_create, .lookup = minix_lookup, .link = minix_link, .unlink = minix_unlink, .symlink = minix_symlink, .mkdir = minix_mkdir, .rmdir = minix_rmdir, .mknod = minix_mknod, .rename = minix_rename, .getattr = minix_getattr, .tmpfile = minix_tmpfile, };
4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). 
* * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> * * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset * Alan Cox : Stopped it crashing if it closed while * sk->inuse=1 and was trying to connect * (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. * Alan Cox : tcp_err() now handled properly. It * wakes people on errors. poll * behaves and the icmp error race * has gone by moving it into sock.c * Alan Cox : tcp_send_reset() fixed to work for * everything not just packets for * unknown sockets. * Alan Cox : tcp option processing. * Alan Cox : Reset tweaked (still not 100%) [Had * syn rule wrong] * Herp Rosmanith : More reset fixes * Alan Cox : No longer acks invalid rst frames. * Acking any kind of RST is right out. * Alan Cox : Sets an ignore me flag on an rst * receive otherwise odd bits of prattle * escape still * Alan Cox : Fixed another acking RST frame bug. * Should stop LAN workplace lockups. * Alan Cox : Some tidyups using the new skb list * facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors * Alan Cox : PSH doesn't end a TCP read. Switched a * bit to skb ops. * Alan Cox : Tidied tcp_data to avoid a potential * nasty. * Alan Cox : Added some better commenting, as the * tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. * Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH * Alan Cox : Added socket option proto entries. * Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) * Alan Cox : Switched wakeup calls to callbacks, * so the kernel can layer network * sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). * Alan Cox : RST frames sent on unsynchronised * state ack error. * Alan Cox : Put in missing check for SYN bit. * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. * Alan Cox : Added a couple of small NET2E timer * fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight * Charles Hedrick : Rewrote most of it to actually work * Linus : Rewrote tcp_read() and URG handling * completely * Gerhard Koerting: Fixed some missing timer handling * Matthew Dillon : Reworked TCP machine states as per RFC * Gerhard Koerting: PC/TCP workarounds * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. * Alan Cox : Beginning work on TCP fastpathing * (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
* Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes * Matt Dillon : Yet more small nasties remove from the * TCP code (Be very nice to this man if * tcp finally works 100%) 8) * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). * Michael Pall : Handle poll() after URG properly in * all cases. * Michael Pall : Undo the last fix in tcp_read_urg() * (multi URG PUSH broke rlogin). * Michael Pall : Fix the multi URG PUSH problem in * tcp_readable(), poll() after URG * works now. * Michael Pall : recv(...,MSG_OOB) never blocks in the * BSD api. * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). * Yury Shevchuk : Really fixed accept() blocking problem. * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for * clients/servers which listen in on * fixed ports. * Alan Cox : Cleaned the above up and shrank it to * a sensible code size. * Alan Cox : Self connect lockup fix. * Alan Cox : No connect to multicast. * Ross Biro : Close unaccepted children on master * socket close. * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error * Alan Cox : Small whoops in polling before an * accept. * Alan Cox : Kept the state trace facility since * it's handy for debugging. * Alan Cox : More reset handler fixes. * Alan Cox : Started rewriting the code based on * the RFC's for other useful protocol * references see: Comer, KA9Q NOS, and * for a reference on the difference * between specifications and how BSD * works see the 4.4lite source. * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn * Alan Cox : Reimplemented timers as per the RFC * and using multiple timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking * the buffers (much like datagram.c) * Alan Cox : Fixed stuck sockets in probe. A probe * now gets fed up of retrying without * (even a no space) answer. * Alan Cox : Extracted closing code better * Alan Cox : Fixed the closing state machine to * resemble the RFC. * Alan Cox : More 'per spec' fixes. * Jorge Cwik : Even faster checksumming. * Alan Cox : tcp_data() doesn't ack illegal PSH * only frames. At least one pc tcp stack * generates them. * Alan Cox : Cache last socket. * Alan Cox : Per route irtt. * Matt Day : poll()->select() match BSD precisely on error * Alan Cox : New buffers * Marc Tamsky : Various sk->prot->retransmits and * sk->retransmits misupdating fixed. * Fixed tcp_write_timeout: stuck close, * and TCP syn retries gets used now. * Mark Yarvis : In tcp_read_wakeup(), don't send an * ack if state is TCP_CLOSED. * Alan Cox : Look up device on a retransmit - routes may * change. Doesn't yet cope with MSS shrink right * but it's a start! * Marc Tamsky : Closing in closing fixes. * Mike Shaver : RFC1122 verifications. * Alan Cox : rcv_saddr errors. * Alan Cox : Block double connect(). * Alan Cox : Small hooks for enSKIP. * Alexey Kuznetsov: Path MTU discovery. * Alan Cox : Support soft errors. * Alan Cox : Fix MTU discovery pathological case * when the remote claims no mtu! * Marc Tamsky : TCP_CLOSE fix. 
* Colin (G3TNE) : Send a reset on syn ack replies in * window but wrong (fixes NT lpd problems) * Pedro Roque : Better TCP window handling, delayed ack. * Joerg Reuter : No modification of locked buffers in * tcp_do_retransmit() * Eric Schenk : Changed receiver side silly window * avoidance algorithm to BSD style * algorithm. This doubles throughput * against machines running Solaris, * and seems to result in general * improvement. * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * Keith Owens : Do proper merging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack * * TCP_SYN_RECV received a connection request, sent ack, * waiting for final ack in three-way handshake. * * TCP_ESTABLISHED connection established * * TCP_FIN_WAIT1 our side has shutdown, waiting to complete * transmission of remaining buffered data * * TCP_FIN_WAIT2 all buffered data sent, waiting for remote * to shutdown * * TCP_CLOSING both sides have shutdown but we still have * data we have to finish sending * * TCP_TIME_WAIT timeout to catch resent junk before entering * closed, can only be entered from FIN_WAIT2 * or CLOSING. Required because the other end * may not have gotten our last ACK causing it * to retransmit the data packet (which we ignore) * * TCP_CLOSE_WAIT remote side has shutdown and is waiting for * us to finish writing our data and to shutdown * (we have to close() to move on to LAST_ACK) * * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending * * TCP_CLOSE socket is finished */ #define pr_fmt(fmt) "TCP: " fmt #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/scatterlist.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> /* Track pending CMSGs. */ enum { TCP_CMSG_INQ = 1, TCP_CMSG_TS = 2 }; DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. 
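 *
 * Aside (illustrative only, not part of the kernel source): the three
 * sysctl_tcp_mem[] thresholds above are exposed as the net.ipv4.tcp_mem
 * sysctl, in units of pages.  A minimal user-space reader, assuming the
 * usual procfs path:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		long lo, pressure, hi;
 *		FILE *f = fopen("/proc/sys/net/ipv4/tcp_mem", "r");
 *
 *		if (!f || fscanf(f, "%ld %ld %ld", &lo, &pressure, &hi) != 3)
 *			return 1;
 *		printf("low %ld, pressure %ld, high %ld (pages)\n",
 *		       lo, pressure, hi);
 *		fclose(f);
 *		return 0;
 *	}
 *
 * tcp_enter_memory_pressure() below is the hook that raises the pressure
 * flag once allocations cross the middle ("pressure") threshold.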
*/ EXPORT_SYMBOL(tcp_memory_allocated); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); EXPORT_SYMBOL(tcp_have_smc); #endif /* * Current number of TCP sockets. */ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(tcp_sockets_allocated); /* * TCP splice context */ struct tcp_splice_state { struct pipe_inode_info *pipe; size_t len; unsigned int flags; }; /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; if (!val) val--; if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); } EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, jiffies_to_msecs(jiffies - val)); } EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) { u8 res = 0; if (seconds > 0) { int period = timeout; res = 1; while (seconds > period && res < 255) { res++; timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return res; } /* Convert retransmits to seconds based on initial and max timeout */ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) { int period = 0; if (retrans > 0) { period = timeout; while (--retrans) { timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return period; } static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) { u32 rate = READ_ONCE(tp->rate_delivered); u32 intv = READ_ONCE(tp->rate_interval_us); u64 rate64 = 0; if (rate && intv) { rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; do_div(rate64, intv); } return rate64; } /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ tcp_snd_cwnd_set(tp, TCP_INIT_CWND); /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. 
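 *
 * Aside (illustrative only): the effect of this initialisation can be
 * observed from user space with the TCP_INFO socket option; a minimal
 * sketch, error handling omitted:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static void dump_cwnd(int fd)
 *	{
 *		struct tcp_info ti;
 *		socklen_t len = sizeof(ti);
 *
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *			printf("cwnd %u ssthresh %u rto %u\n",
 *			       ti.tcpi_snd_cwnd, ti.tcpi_snd_ssthresh,
 *			       ti.tcpi_rto);
 *	}
 *
 * On a freshly connected socket the reported cwnd typically reflects
 * TCP_INIT_CWND, and ssthresh the "infinite" initial value set just below.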
*/ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_sync_mss = tcp_sync_mss; WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) tcb->txstamp_ack = 1; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } } static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; return sk_is_readable(sk); } /* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events * by poll logic and correct handling of state changes * made by other threads is impossible in any case. */ mask = 0; /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a * socket the read side is more interesting. * * Some poll() documentation says that EPOLLHUP is incompatible * with the EPOLLOUT/POLLWR flags, so somebody should check this * all. But careful, it tends to be safer to return too many * bits than too few, and you can easily break real applications * if you don't tell them that something has hung up! * * Check-me. * * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and * our fs/select.c). It means that after we received EOF, * poll always returns immediately, making impossible poll() on write() * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX * solve this dilemma. I would prefer, if EPOLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains * why EPOLLHUP is incompatible with EPOLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? 
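 *
 * Aside (illustrative only): from user space these mask bits surface as
 * the usual poll(2) and epoll(7) events.  A sketch of a caller
 * interpreting them:
 *
 *	#define _GNU_SOURCE		// for POLLRDHUP
 *	#include <poll.h>
 *	#include <stdio.h>
 *
 *	static void wait_for_socket(int fd)
 *	{
 *		struct pollfd pfd = {
 *			.fd = fd,
 *			.events = POLLIN | POLLOUT | POLLRDHUP,
 *		};
 *
 *		if (poll(&pfd, 1, -1) <= 0)
 *			return;
 *		if (pfd.revents & POLLIN)
 *			printf("readable (EPOLLIN | EPOLLRDNORM here)\n");
 *		if (pfd.revents & POLLOUT)
 *			printf("writable (EPOLLOUT | EPOLLWRNORM here)\n");
 *		if (pfd.revents & POLLRDHUP)
 *			printf("peer closed its side (RCV_SHUTDOWN here)\n");
 *		if (pfd.revents & POLLHUP)
 *			printf("both directions shut down, or TCP_CLOSE\n");
 *	}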
*/ if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); u16 urg_data = READ_ONCE(tp->urg_data); if (unlikely(urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE)) target++; if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. Memory barrier * pairs with the input side. */ smp_mb__after_atomic(); if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else mask |= EPOLLOUT | EPOLLWRNORM; if (urg_data & TCP_URG_VALID) mask |= EPOLLPRI; } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() * in order for kernel to generate SYN+data */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; } EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, int *karg) { struct tcp_sock *tp = tcp_sk(sk); int answ; bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; slow = lock_sock_fast(sk); answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: answ = READ_ONCE(tp->urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; } *karg = answ; return 0; } EXPORT_SYMBOL(tcp_ioctl); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } static inline bool forced_push(const struct tcp_sock *tp) { return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { if (flags & MSG_OOB) tp->snd_up = tp->write_seq; } /* If a not yet filled skb is pushed, do not send it if * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. 
* As packets containing data payload have a bigger truesize * than pure acks (dataless) packets, the last checks prevent * autocorking if we only have an ACK in Qdisc/NIC queues, * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize && tcp_skb_can_collapse_to(skb); } void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; skb = tcp_write_queue_tail(sk); if (!skb) return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags); if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. */ if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize) return; } if (flags & MSG_MORE) nonagle = TCP_NAGLE_CORK; __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; } static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) { /* Store TCP splice context information in read_descriptor_t. */ read_descriptor_t rd_desc = { .arg.data = tss, .count = tss->len, }; return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); } /** * tcp_splice_read - splice data from TCP socket to a pipe * @sock: socket to splice from * @ppos: position (not valid) * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given socket and fill them into a pipe. * **/ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct tcp_splice_state tss = { .pipe = pipe, .len = len, .flags = flags, }; long timeo; ssize_t spliced; int ret; sock_rps_record_flow(sk); /* * We can't seek on a socket input */ if (unlikely(*ppos)) return -ESPIPE; ret = spliced = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); while (tss.len) { ret = __tcp_splice_read(sk, &tss); if (ret < 0) break; else if (!ret) { if (spliced) break; if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { ret = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* * This occurs when user tries to read * from never connected socket. */ ret = -ENOTCONN; break; } if (!timeo) { ret = -EAGAIN; break; } /* if __tcp_splice_read() got nothing while we have * an skb in receive queue, we do not want to loop. * This might happen with URG data. 
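 *
 * Aside (illustrative only): the user-space path into tcp_splice_read()
 * is a plain splice(2) from a connected TCP socket into a pipe, e.g.:
 *
 *	#define _GNU_SOURCE		// for splice()
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static ssize_t sock_to_pipe(int sock_fd, int pipe_wr)
 *	{
 *		return splice(sock_fd, NULL, pipe_wr, NULL, 65536,
 *			      SPLICE_F_MOVE | SPLICE_F_MORE);
 *	}
 *
 * The end-of-stream, error and would-block cases handled in this loop
 * are what that splice(2) call ultimately reports back to its caller.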
*/ if (!skb_queue_empty(&sk->sk_receive_queue)) break; ret = sk_wait_data(sk, &timeo, NULL); if (ret < 0) break; if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } continue; } tss.len -= ret; spliced += ret; if (!tss.len || !timeo) break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } release_sock(sk); if (spliced) return spliced; return ret; } EXPORT_SYMBOL(tcp_splice_read); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); } else { mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } if (likely(mem_scheduled)) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; } static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size); /* We try hard to avoid divides here */ size_goal = tp->gso_segs * mss_now; if (unlikely(new_size_goal < size_goal || new_size_goal >= size_goal + mss_now)) { tp->gso_segs = min_t(u16, new_size_goal / mss_now, sk->sk_gso_max_segs); size_goal = tp->gso_segs * mss_now; } return max(size_goal, mss_now); } int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; mss_now = tcp_current_mss(sk); *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); return mss_now; } /* In some cases, sendmsg() could have added an skb to the write queue, * but failed adding payload on it. We need to remove it to consume less * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger * epoll() users. Another reason is that tcp_write_xmit() does not like * finding an empty skb in the write queue. */ void tcp_remove_empty_skb(struct sock *sk) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); tcp_wmem_free_skb(sk, skb); } } /* skb changing from pure zc to mixed, must charge zc */ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) { if (unlikely(skb_zcopy_pure(skb))) { u32 extra = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb)); if (!sk_wmem_schedule(sk, extra)) return -ENOMEM; sk_mem_charge(sk, extra); skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; } return 0; } int tcp_wmem_schedule(struct sock *sk, int copy) { int left; if (likely(sk_wmem_schedule(sk, copy))) return copy; /* We could be in trouble if we have nothing queued. * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] * to guarantee some progress. 
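 *
 * For instance, with tcp_wmem[0] = 4096 and 1536 bytes already queued,
 * left = 2560: a 4000 byte copy is force-charged for at most 2560 bytes,
 * and the caller then proceeds with whatever sk_forward_alloc permits.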
*/ left = sock_net(sk)->ipv4.sysctl_tcp_wmem[0] - sk->sk_wmem_queued; if (left > 0) sk_forced_mem_schedule(sk, min(left, copy)); return min(copy, sk->sk_forward_alloc); } void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } } int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sockaddr *uaddr = msg->msg_name; int err, flags; if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; if (inet_test_bit(DEFER_CONNECT, sk)) { err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { tcp_set_state(sk, TCP_CLOSE); inet->inet_dport = 0; sk->sk_route_caps = 0; } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst */ if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); } return err; } int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; int zc = 0; long timeo; flags = msg->msg_flags; if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; } if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) goto out_err; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. 
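 *
 * Aside (illustrative only): tcp_sendmsg_fastopen() above is reached from
 * user space either by sendto()/sendmsg() with MSG_FASTOPEN on a not yet
 * connected socket, or by connect() after setting TCP_FASTOPEN_CONNECT
 * (the DEFER_CONNECT case).  A client-side sketch, assuming "addr"
 * already holds the server address:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	static ssize_t tfo_send(int fd, const struct sockaddr_in *addr,
 *				const void *buf, size_t len)
 *	{
 *		// The SYN carries the payload when a Fast Open cookie is
 *		// already cached; otherwise the data goes out after the
 *		// normal three-way handshake completes.
 *		return sendto(fd, buf, len, MSG_FASTOPEN,
 *			      (const struct sockaddr *)addr, sizeof(*addr));
 *	}
 *
 * net.ipv4.tcp_fastopen must have the client bit set, which is exactly
 * the TFO_CLIENT_ENABLE check at the top of tcp_sendmsg_fastopen().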
*/ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); goto out_nopush; } err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out_err; /* 'common' sending to sendq */ } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { err = -EINVAL; goto out_err; } } /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ copied = 0; restart: mss_now = tcp_send_mss(sk, &size_goal, flags); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { ssize_t copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; if (sk_flush_backlog(sk)) goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; process_backlog++; tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); if (zc == 0) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } merge = false; } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; skb_zcopy_downgrade_managed(skb); } copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto do_error; /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); page_ref_inc(pfrag->page); } pfrag->offset += copy; } else if (zc == MSG_ZEROCOPY) { /* First append to a fragless skb builds initial * pure zerocopy skb */ if (!skb->len) skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; if (!skb_zcopy_pure(skb)) { copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; } if (err < 0) goto do_error; copy = err; } else if (zc == MSG_SPLICE_PAGES) { /* Splice in data if we can; copy if we can't. 
*/ if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) { if (err == -EMSGSIZE) { tcp_mark_push(tp, skb); goto new_segment; } goto do_error; } copy = err; if (!(flags & MSG_NO_SHARED_FRAGS)) skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); } if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; } if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_remove_empty_skb(sk); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) { tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); return copied + copied_syn; do_error: tcp_remove_empty_skb(sk); if (copied + copied_syn) goto out; out_err: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int ret; lock_sock(sk); ret = tcp_sendmsg_locked(sk, msg, size); release_sock(sk); return ret; } EXPORT_SYMBOL(tcp_sendmsg); void tcp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; if (!tcp_write_queue_tail(sk)) return; lock_sock(sk); mss_now = tcp_send_mss(sk, &size_goal, 0); tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); release_sock(sk); } EXPORT_SYMBOL_GPL(tcp_splice_eof); /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); /* No URG data to read. */ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) return -ENOTCONN; if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; if (len > 0) { if (!(flags & MSG_TRUNC)) err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; return err ? -EFAULT : len; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 0; /* Fixed the recv(..., MSG_OOB) behaviour. 
BSD docs and * the available implementations agree in this case: * this call should never block, independent of the * blocking state of the socket. * Mike <pall@rz.uni-karlsruhe.de> */ return -EAGAIN; } static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) { struct sk_buff *skb; int copied = 0, err = 0; /* XXX -- need to support SO_PEEK_OFF */ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) return err; copied += skb->len; } skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; copied += skb->len; } return err ?: copied; } /* Clean up the receive buffer for full frames taken by the user, * then send an ACK if necessary. COPIED is the number of bytes * tcp_recvmsg has given to the user so far, it speeds up the * calculation of whether or not we must ACK for the sake of * a window update. */ void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". * * Even if window raised up to infinity, do not send window open ACK * in states, where we will not receive more. It is useless. */ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); /* Optimize, __tcp_select_window() is not cheap. */ if (2*rcv_window_now <= tp->window_clamp) { __u32 new_window = __tcp_select_window(sk); /* Send ACK now, if this read freed lots of space * in our buffer. Certainly, new_window is new window. * We can advertise it now, if it is not less than current one. * "Lots" means "at least twice" here. 
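 *
 * For instance: if the currently advertised window has shrunk to 8 KB
 * and, after this read, __tcp_select_window() can offer 32 KB, then
 * 32 KB >= 2 * 8 KB and a window-update ACK is sent right away.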
*/ if (new_window && new_window >= 2 * rcv_window_now) time_to_ack = true; } } if (time_to_ack) tcp_send_ack(sk); } void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tcp_sock *tp = tcp_sk(sk); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); __tcp_cleanup_rbuf(sk, copied); } static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); if (likely(skb->destructor == sock_rfree)) { sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; return skb_attempt_defer_free(skb); } __kfree_skb(skb); } struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ tcp_eat_recv_skb(sk, skb); } return NULL; } EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines * that would like to handle copying from skbuffs directly in 'sendfile' * fashion. * Note: * - It is assumed that the socket was locked by the caller. * - The routine does not block. * - At present, there is no support for reading OOB data * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; u32 offset; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { if (offset < skb->len) { int used; size_t len; len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; if (!len) break; } used = recv_actor(desc, skb, offset, len); if (used <= 0) { if (!copied) copied = used; break; } if (WARN_ON_ONCE(used > len)) used = len; seq += used; copied += used; offset += used; /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ skb = tcp_recv_skb(sk, seq - 1, &offset); if (!skb) break; /* TCP coalescing might have appended data to the skb. * Try to splice more frags */ if (offset + 1 != skb->len) continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(tp->copied_seq, seq); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. 
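 *
 * Aside (illustrative only): tcp_read_sock() is driven by a caller
 * supplied recv_actor.  A minimal, hypothetical actor in the style of
 * tcp_splice_data_recv() above, which merely counts the bytes offered:
 *
 *	static int count_bytes_actor(read_descriptor_t *rd_desc,
 *				     struct sk_buff *skb,
 *				     unsigned int offset, size_t len)
 *	{
 *		size_t *total = rd_desc->arg.data;	// caller-owned counter
 *
 *		*total += len;
 *		return len;		// consume everything offered
 *	}
 *
 * Returning 0 or a negative value from the actor stops the walk, as the
 * "used <= 0" check above shows.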
*/ if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } return copied; } EXPORT_SYMBOL(tcp_read_sock); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u8 tcp_flags; int used; __skb_unlink(skb, &sk->sk_receive_queue); WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); tcp_flags = TCP_SKB_CB(skb)->tcp_flags; used = recv_actor(sk, skb); if (used < 0) { if (!copied) copied = used; break; } copied += used; if (tcp_flags & TCPHDR_FIN) break; } return copied; } EXPORT_SYMBOL(tcp_read_skb); void tcp_read_done(struct sock *sk, size_t len) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; struct sk_buff *skb; size_t left; u32 offset; if (sk->sk_state == TCP_LISTEN) return; left = len; while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { int used; used = min_t(size_t, skb->len - offset, left); seq += used; left -= used; if (skb->len > offset + used) break; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (left != len) tcp_cleanup_rbuf(sk, len - left); } EXPORT_SYMBOL(tcp_read_done); int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); tcp_sk(sk)->window_clamp = val; } return 0; } EXPORT_SYMBOL(tcp_set_rcvlowat); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); else tss->ts[0] = (struct timespec64) {0}; if (skb_hwtstamps(skb)->hwtstamp) tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); else tss->ts[2] = (struct timespec64) {0}; } #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; } EXPORT_SYMBOL(tcp_mmap); static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) { skb_frag_t *frag; if (unlikely(offset_skb >= skb->len)) return NULL; offset_skb -= skb_headlen(skb); if ((int)offset_skb < 0 || skb_has_frag_list(skb)) return NULL; frag = skb_shinfo(skb)->frags; while (offset_skb) { if (skb_frag_size(frag) > offset_skb) { *offset_frag = offset_skb; return frag; } offset_skb -= skb_frag_size(frag); ++frag; } *offset_frag = 0; return frag; } static bool can_map_frag(const skb_frag_t *frag) { struct page *page; if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag)) return false; page = skb_frag_page(frag); if (PageCompound(page) || page->mapping) return false; return true; } static int find_next_mappable_frag(const skb_frag_t *frag, int remaining_in_skb) { int offset = 0; if (likely(can_map_frag(frag))) return 0; while (offset < remaining_in_skb && !can_map_frag(frag)) { offset += skb_frag_size(frag); ++frag; } return offset; } static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) { u32 frag_offset, partial_frag_remainder = 0; int mappable_offset; skb_frag_t *frag; /* worst case: skip to next skb. try to improve on this case below */ zc->recv_skip_hint = skb->len - offset; /* Find the frag containing this offset (and how far into that frag) */ frag = skb_advance_to_frag(skb, offset, &frag_offset); if (!frag) return; if (frag_offset) { struct skb_shared_info *info = skb_shinfo(skb); /* We read part of the last frag, must recvmsg() rest of skb. */ if (frag == &info->frags[info->nr_frags - 1]) return; /* Else, we must at least read the remainder in this frag. */ partial_frag_remainder = skb_frag_size(frag) - frag_offset; zc->recv_skip_hint -= partial_frag_remainder; ++frag; } /* partial_frag_remainder: If part way through a frag, must read rest. * mappable_offset: Bytes till next mappable frag, *not* counting bytes * in partial_frag_remainder. 
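 *
 * Aside (illustrative only): all of this hint machinery serves the
 * TCP_ZEROCOPY_RECEIVE getsockopt().  A user-space sketch, assuming the
 * socket was mmap()ed read-only as tcp_mmap() above requires:
 *
 *	#include <linux/tcp.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	// "map" is a page-aligned region obtained beforehand with
 *	// mmap(NULL, map_len, PROT_READ, MAP_SHARED, fd, 0).
 *	static int zc_receive(int fd, void *map, unsigned int map_len)
 *	{
 *		struct tcp_zerocopy_receive zc;
 *		socklen_t len = sizeof(zc);
 *
 *		memset(&zc, 0, sizeof(zc));
 *		zc.address = (__u64)(unsigned long)map;
 *		zc.length = map_len;
 *		if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
 *			       &zc, &len))
 *			return -1;
 *		// zc.length bytes are now mapped at "map";
 *		// zc.recv_skip_hint bytes must still be read with recvmsg().
 *		return zc.length;
 *	}
 *
 * recv_skip_hint is the value computed by this helper and by
 * tcp_zerocopy_receive() below.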
*/ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); zc->recv_skip_hint = mappable_offset + partial_frag_remainder; } static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; zc->length = 0; zc->recv_skip_hint = 0; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, &msg.msg_iter); if (err) return err; err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, tss, &zc->msg_flags); if (err < 0) return err; zc->copybuf_len = err; if (likely(zc->copybuf_len)) { struct sk_buff *skb; u32 offset; skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); if (skb) tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); } return 0; } static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); if (err) return err; zc->recv_skip_hint -= copylen; *offset += copylen; *seq += copylen; return (__s32)copylen; } static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, struct sock *sk, struct sk_buff *skb, u32 *seq, s32 copybuf_len, struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, *seq, &offset); if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); return zc->copybuf_len < 0 ? 0 : copylen; } static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map, int err) { /* At least one page did not map. Try zapping if we skipped earlier. */ if (err == -EBUSY && zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { u32 maybe_zap_len; maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ zap_page_range_single(vma, *address, maybe_zap_len, NULL); err = 0; } if (!err) { unsigned long leftover_pages = pages_remaining; int bytes_mapped; /* We called zap_page_range_single, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); *seq += bytes_mapped; *address += bytes_mapped; } if (err) { /* Either we were unable to zap, OR we zapped, retried an * insert, and still had an issue. Either ways, pages_remaining * is the number of pages we were unable to map, and we unroll * some state we speculatively touched before. 
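 *
 * For instance, if the batch had 8 pages and 3 of them could not be
 * inserted, bytes_not_mapped below is 3 * PAGE_SIZE: the reported length
 * shrinks and recv_skip_hint grows by that amount, so user space falls
 * back to recvmsg() for those bytes.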
*/ const int bytes_not_mapped = PAGE_SIZE * pages_remaining; *length -= bytes_not_mapped; zc->recv_skip_hint += bytes_not_mapped; } return err; } static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; unsigned int pages_mapped; unsigned int bytes_mapped; int err; err = vm_insert_pages(vma, *address, pages, &pages_remaining); pages_mapped = pages_to_map - (unsigned int)pages_remaining; bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; *address += bytes_mapped; if (likely(!err)) return 0; /* Error: maybe zap and retry + rollback state for failed inserts. */ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, pages_remaining, address, length, seq, zc, total_bytes_to_map, err); } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { unsigned long msg_control_addr; struct msghdr cmsg_dummy; msg_control_addr = (unsigned long)zc->msg_control; cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; cmsg_dummy.msg_control_is_user = true; zc->msg_flags = 0; if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); zc->msg_control = (__u64) ((uintptr_t)cmsg_dummy.msg_control_user); zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; } } static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); if (vma) { if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } *mmap_locked = false; return vma; } mmap_read_lock(mm); vma = vma_lookup(mm, address); if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } *mmap_locked = true; return vma; } #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); bool mmap_locked; int ret; zc->copybuf_len = 0; zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq; if (!inq && sock_flag(sk, SOCK_DONE)) return -EIO; return 0; } vma = find_tcp_vma(current->mm, address, &mmap_locked); if (!vma) return -EINVAL; vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = 
avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) zap_page_range_single(vma, address, total_bytes_to_map, NULL); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { zc->length = avail_len; zc->recv_skip_hint = avail_len; } ret = 0; while (length + PAGE_SIZE <= zc->length) { int mappable_offset; struct page *page; if (zc->recv_skip_hint < PAGE_SIZE) { u32 offset_frag; if (skb) { if (zc->recv_skip_hint > 0) break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, seq, &offset); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) break; } mappable_offset = find_next_mappable_frag(frags, zc->recv_skip_hint); if (mappable_offset) { zc->recv_skip_hint = mappable_offset; break; } page = skb_frag_page(frags); prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || zc->recv_skip_hint < PAGE_SIZE) { /* Either full batch, or we're about to go to next skb * (and we cannot unroll failed ops across skbs). */ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); if (ret) goto out; pages_to_map = 0; } } if (pages_to_map) { ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); } out: if (mmap_locked) mmap_read_unlock(current->mm); else vma_end_read(vma); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. 
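 *
 * copied_seq has just advanced by the mapped bytes plus any straggler
 * bytes copied above; the tcp_recv_skb() call below releases the skbs
 * that are now fully consumed and tcp_cleanup_rbuf() may send a window
 * update ACK for the freed receive space.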
*/ tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; } else { if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) ret = -EIO; } zc->length = length; return ret; } #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { struct __kernel_old_timeval tv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } } if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; } if (has_timestamping) { tss->ts[1] = (struct timespec64) {0}; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else put_cmsg_scm_timestamping(msg, tss); } } static int tcp_inq_hint(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u32 copied_seq = READ_ONCE(tp->copied_seq); u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); int inq; inq = rcv_nxt - copied_seq; if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { lock_sock(sk); inq = tp->rcv_nxt - tp->copied_seq; release_sock(sk); } /* After receiving a FIN, tell the user-space to continue reading * by returning a non-zero inq. */ if (inq == 0 && sock_flag(sk, SOCK_DONE)) inq = 1; return inq; } /* * This routine copies from a sock struct into the user buffer. * * Technical note: in 2.3 we work on _locked_ socket, so that * tricks with *seq access order and skb->users are not required. * Probably, code can be easily improved even more. */ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; if (tp->recvmsg_inq) { *cmsg_flags = TCP_CMSG_INQ; msg->msg_get_inq = 1; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. 
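 *
 * MSG_OOB is diverted to tcp_recv_urg() via the recv_urg label below.
 * A minimal userspace sketch (illustrative only; handle_urgent() is a
 * hypothetical helper):
 *
 *	char oob;
 *
 *	if (recv(fd, &oob, 1, MSG_OOB) == 1)
 *		handle_urgent(oob);
 *
 * With SO_OOBINLINE set the urgent byte is delivered inline with the
 * normal data stream instead.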
*/ if (flags & MSG_OOB) goto recv_urg; if (unlikely(tp->repair)) { err = -EPERM; if (!(flags & MSG_PEEK)) goto out; if (tp->repair_queue == TCP_SEND_QUEUE) goto recv_sndq; err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out; /* 'common' recv queue MSG_PEEK-ing */ } seq = &tp->copied_seq; if (flags & MSG_PEEK) { peek_seq = tp->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); do { u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } } /* Next get a buffer. */ last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { last = skb; /* Now that we have two receive queues this * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; offset = *seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (!timeo || sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); err = sk_wait_data(sk, &timeo, last); if (err < 0) { err = copied ? : err; goto out; } } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq; } continue; found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; /* Do we have urgent data here? */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; if (!used) goto skip_copy; } } else used = urg_offset; } } if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! 
*/ if (!copied) copied = -EFAULT; break; } } WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; tcp_rcv_space_adjust(sk); skip_copy: if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) { WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) continue; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); break; } while (len > 0); /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); return copied; out: return err; recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) sk_busy_loop(sk, flags & MSG_DONTWAIT); lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (msg->msg_get_inq) { msg->msg_inq = tcp_inq_hint(sk); if (cmsg_flags & TCP_CMSG_INQ) put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(msg->msg_inq), &msg->msg_inq); } } return ret; } EXPORT_SYMBOL(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; /* We defined a new enum for TCP states that are exported in BPF * so as not force the internal TCP states to be frozen. The * following checks will detect if an internal state value ever * differs from the BPF value. If this ever happens, then we will * need to remap the internal value to the BPF value before calling * tcp_call_bpf_2arg. */ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. * But clang built vmlinux does not have this enum in DWARF * since clang removes the above code before generating IR/debuginfo. 
* Let us explicitly emit the type debuginfo to ensure the * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF * regardless of which compiler is used. */ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == TCP_ESTABLISHED) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_state_store(sk, state); } EXPORT_SYMBOL_GPL(tcp_set_state); /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; tcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). */ void tcp_shutdown(struct sock *sk, int how) { /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ if (!(how & SEND_SHUTDOWN)) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. 
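 *
 * For illustration, userspace typically reaches this path via a half
 * close (sketch, error handling omitted):
 *
 *	char buf[4096];
 *
 *	shutdown(fd, SHUT_WR);			// stop sending; a FIN is queued
 *	while (read(fd, buf, sizeof(buf)) > 0)
 *		;				// the receive side still works
 *
 * The FIN goes out behind any data still queued, and the peer eventually
 * reads EOF.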
*/ if (tcp_close_state(sk)) tcp_send_fin(sk); } } EXPORT_SYMBOL(tcp_shutdown); int tcp_orphan_count_sum(void) { int i, total = 0; for_each_possible_cpu(i) total += per_cpu(tcp_orphan_count, i); return max(total, 0); } static int tcp_orphan_cache; static struct timer_list tcp_orphan_timer; #define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) static void tcp_orphan_update(struct timer_list *unused) { WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); } static bool tcp_too_many_orphans(int shift) { return READ_ONCE(tcp_orphan_cache) << shift > READ_ONCE(sysctl_tcp_max_orphans); } bool tcp_check_oom(struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) net_info_ratelimited("too many orphaned sockets\n"); if (out_of_socket_memory) net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) len--; data_was_unread += len; __kfree_skb(skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk * GET in an FTP client, suspend the process, wait for the client to * advertise a zero window, then kill -9 the FTP client, wheee... * Note: timeout is always zero in such a case. */ if (unlikely(tcp_sk(sk)->repair)) { sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ /* RED-PEN. Formally speaking, we have broken TCP state * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), * rather than queued out of window. Purists blame. * * F.e. "RFC state" is ESTABLISHED, * if Linux state is FIN-WAIT-1, but FIN is still not sent. 
* * The visible declinations are that sometimes * we enter time-wait state, when it is not required really * (harmless), do not send active resets, when they are * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK * XXX (TFO) - To start off we don't support SYN+ACK+FIN * in a single packet! (May consider it later but will * probably need API support or TCP_CORK SYN-ACK until * data is written and socket is closed.) */ tcp_send_fin(sk); } sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); local_bh_disable(); bh_lock_sock(sk); /* remove backlog if any, without releasing ownership. */ __release_sock(sk); this_cpu_inc(tcp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) goto out; /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. * * Nope, it was not mistake. It is really desired behaviour * f.e. on http servers, when such sockets are useless, but * consume significant resources. Let's do it with special * linger2 option. --ANK */ if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } } if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_set_state(sk, TCP_CLOSE); } } if (sk->sk_state == TCP_CLOSE) { struct request_sock *req; req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. */ if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } /* Otherwise, socket is reprieved until protocol close. 
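 *
 * "Reprieved" means the orphaned socket is left to the protocol timers
 * (FIN_WAIT2, TIME_WAIT, retransmissions); it is destroyed later from
 * those paths instead of right here.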
*/ out: bh_unlock_sock(sk); local_bh_enable(); } void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); /* Since we are deleting whole queue, no need to * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } } void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { /* The last check adjusts for discrepancy of Linux wrt. 
RFC * states */ tcp_send_active_reset(sk, gfp_any()); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; seq = tp->write_seq + tp->max_window + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; tp->is_cwnd_limited = 0; tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; tp->segs_out = 0; tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; tp->retrans_out = 0; tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. 
*/ tp->app_limited = ~0U; tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; tp->rack.last_delivered = 0; tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; if (!tp->repair) return -EPERM; if (len != sizeof(opt)) return -EINVAL; if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) return -EINVAL; if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) return -EINVAL; if (after(opt.rcv_wup, tp->rcv_nxt)) return -EINVAL; tp->snd_wl1 = opt.snd_wl1; tp->snd_wnd = opt.snd_wnd; tp->max_window = opt.max_window; tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; return 0; } static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; size_t offset = 0; while (len >= sizeof(opt)) { if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; tp->rx_opt.rcv_wscale = rcv_wscale; tp->rx_opt.wscale_ok = 1; } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.tstamp_ok = 1; break; } } return 0; } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); EXPORT_SYMBOL(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(void) { if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { static_branch_enable(&tcp_tx_delay_enabled); pr_info("TCP_TX_DELAY enabled\n"); } } } /* When set indicates to always queue non-full frames. Later the user clears * this option and we transmit any pending partial frames in the queue. This is * meant to be used alongside sendfile() to get properly filled frames when the * user (for example) must write out headers with a write() call first and then * use sendfile to send out the data parts. * * TCP_CORK can be set together with TCP_NODELAY and it is stronger than * TCP_NODELAY. 
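 *
 * Illustrative userspace usage (hdr, hdr_len, file_fd and body_len are
 * placeholders):
 *
 *	int on = 1, off = 0;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	write(fd, hdr, hdr_len);		// queued, not yet sent
 *	sendfile(fd, file_fd, NULL, body_len);	// appended to the same frames
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *
 * Clearing TCP_CORK ends up in __tcp_sock_set_cork() below, which pushes
 * any pending partial frames.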
*/ void __tcp_sock_set_cork(struct sock *sk, bool on) { struct tcp_sock *tp = tcp_sk(sk); if (on) { tp->nonagle |= TCP_NAGLE_CORK; } else { tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle & TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } } void tcp_sock_set_cork(struct sock *sk, bool on) { lock_sock(sk); __tcp_sock_set_cork(sk, on); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_cork); /* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is * remembered, but it is not activated until cork is cleared. * * However, when TCP_NODELAY is set we make an explicit push, which overrides * even TCP_CORK for currently queued segments. */ void __tcp_sock_set_nodelay(struct sock *sk, bool on) { if (on) { tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } else { tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; } } void tcp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); __tcp_sock_set_nodelay(sk, true); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_nodelay); static void __tcp_sock_set_quickack(struct sock *sk, int val) { if (!val) { inet_csk_enter_pingpong_mode(sk); return; } inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) inet_csk_enter_pingpong_mode(sk); } } void tcp_sock_set_quickack(struct sock *sk, int val) { lock_sock(sk); __tcp_sock_set_quickack(sk, val); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_quickack); int tcp_sock_set_syncnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); int tcp_sock_set_user_timeout(struct sock *sk, int val) { /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. 
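 *
 * Illustrative userspace usage: abort the connection if no transmitted
 * data is acknowledged for roughly 30 seconds:
 *
 *	unsigned int tmo_ms = 30000;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &tmo_ms, sizeof(tmo_ms));
 *
 * A value of 0 falls back to the system defaults.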
*/ if (val < 0) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; /* Paired with WRITE_ONCE() in keepalive_time_when() */ WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; inet_csk_reset_keepalive_timer(sk, elapsed); } return 0; } int tcp_sock_set_keepidle(struct sock *sk, int val) { int err; lock_sock(sk); err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } EXPORT_SYMBOL(tcp_sock_set_keepidle); int tcp_sock_set_keepintvl(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); int tcp_sock_set_keepcnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; tp->window_clamp = 0; } else { u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val; if (new_window_clamp == old_window_clamp) return 0; tp->window_clamp = new_window_clamp; if (new_window_clamp < old_window_clamp) { /* need to apply the reserved mem provisioning only * when shrinking the window clamp */ __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp); } else { new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh); } } return 0; } /* * Socket option code for TCP. */ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int val; int err = 0; /* These are data/string values, all the others are ints */ switch (optname) { case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); sockopt_release_sock(sk); return err; } case TCP_ULP: { char name[TCP_ULP_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; __u8 *backup_key = NULL; /* Allow a backup key as well to facilitate key rotation * First key is the active one. 
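 *
 * Illustrative userspace usage: pass either one TCP_FASTOPEN_KEY_LENGTH
 * (16 byte) key, or two concatenated keys, new primary first and the
 * previous key second, so cookies minted under the old key keep working
 * during rotation:
 *
 *	uint8_t keys[32];	// primary key followed by backup key
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_KEY, keys, sizeof(keys));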
*/ if (optlen != TCP_FASTOPEN_KEY_LENGTH && optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) backup_key = key + TCP_FASTOPEN_KEY_LENGTH; return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ break; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; /* Handle options that can be set without locking the socket. */ switch (optname) { case TCP_SYNCNT: return tcp_sock_set_syncnt(sk, val); case TCP_USER_TIMEOUT: return tcp_sock_set_user_timeout(sk, val); case TCP_KEEPINTVL: return tcp_sock_set_keepintvl(sk, val); case TCP_KEEPCNT: return tcp_sock_set_keepcnt(sk, val); case TCP_LINGER2: if (val < 0) WRITE_ONCE(tp->linger2, -1); else if (val > TCP_FIN_TIMEOUT_MAX / HZ) WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else WRITE_ONCE(tp->linger2, val * HZ); return 0; case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; } sockopt_lock_sock(sk); switch (optname) { case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; } tp->rx_opt.user_mss = val; break; case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: if (val < 0 || val > 1) err = -EINVAL; else tp->thin_lto = val; break; case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; break; case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; break; case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; break; case TCP_QUEUE_SEQ: if (sk->sk_state != TCP_CLOSE) { err = -EPERM; } else if (tp->repair_queue == TCP_SEND_QUEUE) { if (!tcp_rtx_queue_empty(sk)) err = -EPERM; else WRITE_ONCE(tp->write_seq, val); } else if (tp->repair_queue == TCP_RECV_QUEUE) { if (tp->rcv_nxt != tp->copied_seq) { err = -EPERM; } else { WRITE_ONCE(tp->rcv_nxt, val); WRITE_ONCE(tp->copied_seq, val); } } else { err = -EINVAL; } break; case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; case TCP_CORK: __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; break; case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: __tcp_sock_set_quickack(sk, val); break; case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) { err = -EPERM; break; } err = tcp_ao_set_repair(sk, optval, optlen); break; #ifdef CONFIG_TCP_AO case TCP_AO_ADD_KEY: case TCP_AO_DEL_KEY: case TCP_AO_INFO: { /* If this is the first TCP-AO setsockopt() on the 
socket, * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR * in any state. */ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto ao_parse; if (rcu_dereference_protected(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk))) goto ao_parse; if (tp->repair) goto ao_parse; err = -EISCONN; break; ao_parse: err = tp->af_specific->ao_parse(sk, optname, optval, optlen); break; } #endif #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { err = -EINVAL; } break; case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else err = -EINVAL; } else { err = -EOPNOTSUPP; } break; case TCP_FASTOPEN_NO_COOKIE: if (val > 1 || val < 0) err = -EINVAL; else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) err = -EINVAL; else tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: if (!tp->repair) { err = -EPERM; break; } /* val is an opaque field, * and low order bit contains usec_ts enable bit. * Its a best effort, and we do not care if user makes an error. */ tp->tcp_usec_ts = val & 1; WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); sk->sk_write_space(sk); break; case TCP_INQ: if (val > 1 || val < 0) err = -EINVAL; else tp->recvmsg_inq = val; break; case TCP_TX_DELAY: if (val) tcp_enable_tx_delay(); WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return err; } int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { u64 stats[__TCP_CHRONO_MAX], total = 0; enum tcp_chrono i; for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } info->tcpi_busy_time = total; info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; } /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); unsigned long rate; u32 now; u64 rate64; bool slow; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) return; info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); rate64 = (rate != ~0UL) ? 
rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } slow = lock_sock_fast(sk); info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } if (tp->ecn_flags & TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) info->tcpi_options |= TCPI_OPT_USEC_TS; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, tcp_delack_max(sk))); info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ info->tcpi_segs_in = READ_ONCE(tp->segs_in); info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 
1 : 0; rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; info->tcpi_rcv_wnd = tp->rcv_wnd; info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; info->tcpi_total_rto = tp->total_rto; info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; info->tcpi_total_rto_time = tp->total_rto_time; if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); static size_t tcp_opt_stats_get_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } /* Returns TTL or hop limit of an incoming packet from skb. 
*/ static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return ip_hdr(skb)->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) return ipv6_hdr(skb)->hop_limit; else return 0; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; unsigned long rate; u64 rate64; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; tcp_get_info_chrono_stats(tp, &info); nla_put_u64_64bit(stats, TCP_NLA_BUSY, info.tcpi_busy_time, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); if (ack_skb) nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; if (tp->rx_opt.user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; case TCP_NODELAY: val = !!(tp->nonagle&TCP_NAGLE_OFF); break; case TCP_CORK: val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: val = keepalive_probes(tp); break; case TCP_SYNCNT: val = READ_ONCE(icsk->icsk_syn_retries) ? 
: READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; break; case TCP_INFO: { struct tcp_info info; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_CC_INFO: { const struct tcp_congestion_ops *ca_ops; union tcp_cc_info info; size_t sz = 0; int attr; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_QUICKACK: val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; case TCP_FASTOPEN_KEY: { u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; case TCP_THIN_DUPACK: val = 0; break; case TCP_REPAIR: val = tp->repair; break; case TCP_REPAIR_QUEUE: if (tp->repair) val = tp->repair_queue; else return -EINVAL; break; case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) return -EINVAL; if (!tp->repair) return -EPERM; opt.snd_wl1 = tp->snd_wl1; opt.snd_wnd = tp->snd_wnd; opt.max_window = tp->max_window; opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; else if (tp->repair_queue == TCP_RECV_QUEUE) val = tp->rcv_nxt; else return -EINVAL; break; case TCP_USER_TIMEOUT: val = READ_ONCE(icsk->icsk_user_timeout); break; case TCP_FASTOPEN: val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: val = tp->fastopen_connect; break; case TCP_FASTOPEN_NO_COOKIE: val = tp->fastopen_no_cookie; break; case TCP_TX_DELAY: val = READ_ONCE(tp->tcp_tx_delay); break; case TCP_TIMESTAMP: val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset); 
if (tp->tcp_usec_ts) val |= 1; else val &= ~1; break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; break; case TCP_SAVE_SYN: val = tp->save_syn; break; case TCP_SAVED_SYN: { if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); sockopt_release_sock(sk); } else { sockopt_release_sock(sk); len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { err = check_zeroed_sockptr(optval, sizeof(zc), len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { case offsetofend(struct tcp_zerocopy_receive, msg_flags): goto zerocopy_rcv_cmsg; case offsetofend(struct tcp_zerocopy_receive, msg_controllen): case offsetofend(struct tcp_zerocopy_receive, msg_control): case offsetofend(struct tcp_zerocopy_receive, flags): case offsetofend(struct tcp_zerocopy_receive, copybuf_len): case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): goto zerocopy_rcv_inq; case offsetofend(struct tcp_zerocopy_receive, length): default: goto zerocopy_rcv_out; } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); else zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } #endif case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) return -EPERM; return tcp_ao_get_repair(sk, optval, optlen); case TCP_AO_GET_KEYS: case TCP_AO_INFO: { int err; sockopt_lock_sock(sk); if (optname == TCP_AO_GET_KEYS) err = tcp_ao_get_mkts(sk, optval, optlen); else err = tcp_ao_get_sock_info(sk, optval, optlen); sockopt_release_sock(sk); return err; } default: return -ENOPROTOOPT; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } bool tcp_bpf_bypass_getsockopt(int level, int optname) { /* TCP do_tcp_getsockopt has optimized getsockopt implementation * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. 
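 *
 * For illustration, the TCP_ZEROCOPY_RECEIVE getsockopt() handled above
 * is driven from userspace roughly like this (sketch; mapped_area and
 * mapped_len are placeholders for a page-aligned, PROT_READ, MAP_SHARED
 * mmap() of the socket):
 *
 *	struct tcp_zerocopy_receive zc = {
 *		.address = (__u64)(unsigned long)mapped_area,
 *		.length = mapped_len,
 *	};
 *	socklen_t zc_len = sizeof(zc);
 *
 *	getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len);
 *	// zc.length bytes are now mapped at mapped_area; zc.recv_skip_hint
 *	// bytes, if any, must still be read with recvmsg().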
*/ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) return true; return false; } EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, optval, optlen); return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } EXPORT_SYMBOL(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG int tcp_md5_sigpool_id = -1; EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id); int tcp_md5_alloc_sigpool(void) { size_t scratch_size; int ret; scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr); ret = tcp_sigpool_alloc_ahash("md5", scratch_size); if (ret >= 0) { /* As long as any md5 sigpool was allocated, the return * id would stay the same. Re-write the id only for the case * when previously all MD5 keys were deleted and this call * allocates the first MD5 key, which may return a different * sigpool id than was used previously. */ WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */ return 0; } return ret; } void tcp_md5_release_sigpool(void) { tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id)); } void tcp_md5_add_sigpool(void) { tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id)); } int tcp_md5_hash_key(struct tcp_sigpool *hp, const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; sg_init_one(&sg, key->key, keylen); ahash_request_set_crypt(hp->req, &sg, NULL, keylen); /* We use data_race() because tcp_md5_do_add() might change * key->key under us */ return data_race(crypto_ahash_update(hp->req)); } EXPORT_SYMBOL(tcp_md5_hash_key); /* Called with rcu_read_lock() */ enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) { /* This gets called for each TCP segment that has TCP-MD5 option. * We have 3 drop cases: * o No MD5 hash and one expected. * o MD5 hash and we're not expecting one. * o MD5 hash and its wrong. */ const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; u8 newhash[16]; int genhash; key = tcp_md5_do_lookup(sk, l3index, saddr, family); if (!key && hash_location) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); tcp_hash_fail("Unexpected MD5 Hash found", family, skb, ""); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } /* Check the signature. * To support dual stack listeners, we need to handle * IPv4-mapped case. */ if (family == AF_INET) genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else genhash = tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); if (family == AF_INET) { tcp_hash_fail("MD5 Hash failed", AF_INET, skb, "%s L3 index %d", genhash ? 
"tcp_v4_calc_md5_hash failed" : "", l3index); } else { if (genhash) { tcp_hash_fail("MD5 Hash failed", AF_INET6, skb, "L3 index %d", l3index); } else { tcp_hash_fail("MD5 Hash mismatch", AF_INET6, skb, "L3 index %d", l3index); } } return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; } EXPORT_SYMBOL(tcp_inbound_md5_hash); #endif void tcp_done(struct sock *sk) { struct request_sock *req; /* We might be called with a new socket, after * inet_csk_prepare_forced_close() has been called * so we can not use lockdep_sock_is_held(sk) */ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); if (req) reqsk_fastopen_remove(sk, req, false); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { int state = inet_sk_state_load(sk); if (state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); local_bh_disable(); inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } if (state == TCP_TIME_WAIT) { struct inet_timewait_sock *tw = inet_twsk(sk); refcount_inc(&tw->tw_refcnt); local_bh_disable(); inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; } /* BPF context ensures sock locking. */ if (!has_current_bpf_ctx()) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); } /* Don't race with BH socket closes such as inet_csk_listen_stop. */ local_bh_disable(); bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { WRITE_ONCE(sk->sk_err, err); /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); } bh_unlock_sock(sk); local_bh_enable(); tcp_write_queue_purge(sk); if (!has_current_bpf_ctx()) release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &thash_entries); if (ret) return 0; return 1; } __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 16; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ sysctl_tcp_mem[1] = limit; /* 6.25 % */ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } static void __init tcp_struct_check(void) { /* TX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, 
tcp_sock_read_txrx, tsoffset); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 113); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); CACHELINE_ASSERT_GROUP_MEMBER(struct 
tcp_sock, tcp_sock_write_txrx, window_clamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); } void __init tcp_init(void) { int max_rshare, max_wshare, cnt; unsigned long limit; unsigned int i; BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); tcp_struct_check(); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); tcp_hashinfo.bind2_bucket_cachep = kmem_cache_create("tcp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, 0, thash_entries ? 
0 : 512 * 1024); for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", 2 * sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, &tcp_hashinfo.bhash_size, NULL, 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); spin_lock_init(&tcp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); max_rshare = min(6UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); mptcp_init(); }
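/*
 * Illustrative userspace sketch (not part of the kernel sources above): the
 * getsockopt() calls below exercise the TCP_INFO and TCP_CONGESTION branches
 * of do_tcp_getsockopt() shown earlier. It assumes a glibc environment where
 * <netinet/tcp.h> exposes struct tcp_info and TCP_CONGESTION, and that "fd"
 * is an already-connected TCP socket; the helper name is hypothetical.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static void dump_tcp_sock_state(int fd)
{
	struct tcp_info info;
	char ca_name[16];		/* kernel truncates to TCP_CA_NAME_MAX (16) */
	socklen_t len = sizeof(info);

	/* The kernel copies min(len, sizeof(struct tcp_info)) bytes and
	 * writes the actual length back through optlen. */
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("rtt=%uus cwnd=%u retrans=%u\n",
		       info.tcpi_rtt, info.tcpi_snd_cwnd, info.tcpi_total_retrans);

	len = sizeof(ca_name);
	/* Returns icsk->icsk_ca_ops->name, e.g. "cubic". */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca_name, &len) == 0)
		printf("congestion control: %.*s\n", (int)len, ca_name);
}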
// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("xt_tcp"); MODULE_ALIAS("xt_udp"); MODULE_ALIAS("ipt_udp"); MODULE_ALIAS("ipt_tcp"); MODULE_ALIAS("ip6t_udp"); MODULE_ALIAS("ip6t_tcp"); MODULE_ALIAS("ipt_icmp"); MODULE_ALIAS("ip6t_icmp6"); /* Returns 1 if the port is matched by the range, 0 otherwise */ static inline bool port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert) { return (port >= min && port <= max) ^ invert; } static bool tcp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, unsigned int optlen, bool invert, bool *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ const u_int8_t *op; u_int8_t _opt[60 - sizeof(struct tcphdr)]; unsigned int i; pr_debug("finding option\n"); if (!optlen) return invert; /* If we don't have the whole header, drop packet. */ op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), optlen, _opt); if (op == NULL) { *hotdrop = true; return false; } for (i = 0; i < optlen; ) { if (op[i] == option) return !invert; if (op[i] < 2) i++; else i += op[i+1]?:1; } return invert; } static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct tcphdr *th; struct tcphdr _tcph; const struct xt_tcp *tcpinfo = par->matchinfo; if (par->fragoff != 0) { /* To quote Alan: Don't allow a fragment of TCP 8 bytes in. Nobody normal causes this. Its a cracker trying to break in by doing a flag overwrite to pass the direction checks. */ if (par->fragoff == 1) { pr_debug("Dropping evil TCP offset=1 frag.\n"); par->hotdrop = true; } /* Must not be a fragment. */ return false; } th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop.
*/ pr_debug("Dropping evil TCP offset=0 tinygram.\n"); par->hotdrop = true; return false; } if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], ntohs(th->source), !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) return false; if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], ntohs(th->dest), !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) return false; if (!NF_INVF(tcpinfo, XT_TCP_INV_FLAGS, (((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp)) return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { par->hotdrop = true; return false; } if (!tcp_find_option(tcpinfo->option, skb, par->thoff, th->doff*4 - sizeof(_tcph), tcpinfo->invflags & XT_TCP_INV_OPTION, &par->hotdrop)) return false; } return true; } static int tcp_mt_check(const struct xt_mtchk_param *par) { const struct xt_tcp *tcpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0; } static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct udphdr *uh; struct udphdr _udph; const struct xt_udp *udpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ pr_debug("Dropping evil UDP tinygram.\n"); par->hotdrop = true; return false; } return port_match(udpinfo->spts[0], udpinfo->spts[1], ntohs(uh->source), !!(udpinfo->invflags & XT_UDP_INV_SRCPT)) && port_match(udpinfo->dpts[0], udpinfo->dpts[1], ntohs(uh->dest), !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); } static int udp_mt_check(const struct xt_mtchk_param *par) { const struct xt_udp *udpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0; } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code) { return type == test_type && code >= min_code && code <= max_code; } static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return (test_type == 0xFF || type_code_in_range(test_type, min_code, max_code, type, code)) ^ invert; } static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert; } static bool icmp_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmphdr *ic; struct icmphdr _icmph; const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ par->hotdrop = true; return false; } return icmp_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->type, ic->code, !!(icmpinfo->invflags & IPT_ICMP_INV)); } static bool icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmp6hdr *ic; struct icmp6hdr _icmph; const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. 
*/ par->hotdrop = true; return false; } return icmp6_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->icmp6_type, ic->icmp6_code, !!(icmpinfo->invflags & IP6T_ICMP_INV)); } static int icmp_checkentry(const struct xt_mtchk_param *par) { const struct ipt_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0; } static int icmp6_checkentry(const struct xt_mtchk_param *par) { const struct ip6t_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; } static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", .family = NFPROTO_IPV4, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "tcp", .family = NFPROTO_IPV6, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "icmp", .match = icmp_match, .matchsize = sizeof(struct ipt_icmp), .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, .me = THIS_MODULE, }, { .name = "icmp6", .match = icmp6_match, .matchsize = sizeof(struct ip6t_icmp), .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, .family = NFPROTO_IPV6, .me = THIS_MODULE, }, }; static int __init tcpudp_mt_init(void) { return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } static void __exit tcpudp_mt_exit(void) { xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } module_init(tcpudp_mt_init); module_exit(tcpudp_mt_exit);
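/*
 * Illustrative sketch: how a userspace iptables extension might populate
 * struct xt_tcp (uapi <linux/netfilter/xt_tcpudp.h>) so that tcp_mt() above
 * matches "source port 1000-2000 and destination port not 80". The structure
 * and XT_TCP_INV_* flags are the ones consumed by this module; the helper
 * itself is hypothetical and only documents the matchinfo layout.
 */
#include <linux/netfilter/xt_tcpudp.h>
#include <string.h>

static void example_fill_xt_tcp(struct xt_tcp *t)
{
	memset(t, 0, sizeof(*t));
	t->spts[0] = 1000;		/* compared against ntohs(th->source) */
	t->spts[1] = 2000;
	t->dpts[0] = 80;		/* single-port range */
	t->dpts[1] = 80;
	t->invflags = XT_TCP_INV_DSTPT;	/* port_match() XORs the result with this */
	t->flg_mask = 0;		/* no TCP flag test */
	t->option = 0;			/* no TCP option test */
}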
#undef TRACE_SYSTEM #define TRACE_SYSTEM neigh #if !defined(_TRACE_NEIGH_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NEIGH_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <net/neighbour.h> #define neigh_state_str(state) \ __print_symbolic(state, \ { NUD_INCOMPLETE, "incomplete" }, \ { NUD_REACHABLE, "reachable" }, \ { NUD_STALE, "stale" }, \ { NUD_DELAY, "delay" }, \ { NUD_PROBE, "probe" }, \ { NUD_FAILED, "failed" }, \ { NUD_NOARP, "noarp" }, \ { NUD_PERMANENT, "permanent"}) TRACE_EVENT(neigh_create, TP_PROTO(struct neigh_table *tbl, struct net_device *dev, const void *pkey, const struct neighbour *n, bool exempt_from_gc), TP_ARGS(tbl, dev, pkey, n, exempt_from_gc), TP_STRUCT__entry( __field(u32, family) __string(dev, dev ? dev->name : "NULL") __field(int, entries) __field(u8, created) __field(u8, gc_exempt) __array(u8, primary_key4, 4) __array(u8, primary_key6, 16) ), TP_fast_assign( __be32 *p32; __entry->family = tbl->family; __assign_str(dev, (dev ? dev->name : "NULL")); __entry->entries = atomic_read(&tbl->gc_entries); __entry->created = n != NULL; __entry->gc_exempt = exempt_from_gc; p32 = (__be32 *)__entry->primary_key4; if (tbl->family == AF_INET) *p32 = *(__be32 *)pkey; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (tbl->family == AF_INET6) { struct in6_addr *pin6; pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)pkey; } #endif ), TP_printk("family %d dev %s entries %d primary_key4 %pI4 primary_key6 %pI6c created %d gc_exempt %d", __entry->family, __get_str(dev), __entry->entries, __entry->primary_key4, __entry->primary_key6, __entry->created, __entry->gc_exempt) ); TRACE_EVENT(neigh_update, TP_PROTO(struct neighbour *n, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid), TP_ARGS(n, lladdr, new, flags, nlmsg_pid), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __array(u8, new_lladdr, MAX_ADDR_LEN) __field(u8, new_state) __field(u32, update_flags) __field(u32, pid) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev, (n->dev ?
n->dev->name : "NULL")); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; if (lladdr) memcpy(__entry->new_lladdr, lladdr, lladdr_len); __entry->new_state = new; __entry->update_flags = flags; __entry->pid = nlmsg_pid; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu new_lladdr %s " "new_state %s update_flags %02x pid %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __print_hex_str(__entry->new_lladdr, __entry->lladdr_len), neigh_state_str(__entry->new_state), __entry->update_flags, __entry->pid) ); DECLARE_EVENT_CLASS(neigh__update, TP_PROTO(struct neighbour *n, int err), TP_ARGS(n, err), TP_STRUCT__entry( __field(u32, family) __string(dev, (n->dev ? n->dev->name : "NULL")) __array(u8, lladdr, MAX_ADDR_LEN) __field(u8, lladdr_len) __field(u8, flags) __field(u8, nud_state) __field(u8, type) __field(u8, dead) __field(int, refcnt) __array(__u8, primary_key4, 4) __array(__u8, primary_key6, 16) __field(unsigned long, confirmed) __field(unsigned long, updated) __field(unsigned long, used) __field(u32, err) ), TP_fast_assign( int lladdr_len = (n->dev ? n->dev->addr_len : MAX_ADDR_LEN); struct in6_addr *pin6; __be32 *p32; __entry->family = n->tbl->family; __assign_str(dev, (n->dev ? 
n->dev->name : "NULL")); __entry->lladdr_len = lladdr_len; memcpy(__entry->lladdr, n->ha, lladdr_len); __entry->flags = n->flags; __entry->nud_state = n->nud_state; __entry->type = n->type; __entry->dead = n->dead; __entry->refcnt = refcount_read(&n->refcnt); pin6 = (struct in6_addr *)__entry->primary_key6; p32 = (__be32 *)__entry->primary_key4; if (n->tbl->family == AF_INET) *p32 = *(__be32 *)n->primary_key; else *p32 = 0; #if IS_ENABLED(CONFIG_IPV6) if (n->tbl->family == AF_INET6) { pin6 = (struct in6_addr *)__entry->primary_key6; *pin6 = *(struct in6_addr *)n->primary_key; } else #endif { ipv6_addr_set_v4mapped(*p32, pin6); } __entry->confirmed = n->confirmed; __entry->updated = n->updated; __entry->used = n->used; __entry->err = err; ), TP_printk("family %d dev %s lladdr %s flags %02x nud_state %s type %02x " "dead %d refcnt %d primary_key4 %pI4 primary_key6 %pI6c " "confirmed %lu updated %lu used %lu err %d", __entry->family, __get_str(dev), __print_hex_str(__entry->lladdr, __entry->lladdr_len), __entry->flags, neigh_state_str(__entry->nud_state), __entry->type, __entry->dead, __entry->refcnt, __entry->primary_key4, __entry->primary_key6, __entry->confirmed, __entry->updated, __entry->used, __entry->err) ); DEFINE_EVENT(neigh__update, neigh_update_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_timer_handler, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_done, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_event_send_dead, TP_PROTO(struct neighbour *neigh, int err), TP_ARGS(neigh, err) ); DEFINE_EVENT(neigh__update, neigh_cleanup_and_release, TP_PROTO(struct neighbour *neigh, int rc), TP_ARGS(neigh, rc) ); #endif /* _TRACE_NEIGH_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
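/*
 * Illustrative userspace sketch: arming the neigh_update tracepoint declared
 * above through tracefs. The mount point is assumed to be /sys/kernel/tracing
 * (older setups expose the same files under /sys/kernel/debug/tracing);
 * records then appear in trace_pipe using the TP_printk() format strings.
 */
#include <fcntl.h>
#include <unistd.h>

static int enable_neigh_update_event(void)
{
	const char *path = "/sys/kernel/tracing/events/neigh/neigh_update/enable";
	int fd = open(path, O_WRONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) == 1)	/* writing "0" disables the event again */
		ret = 0;
	close(fd);
	return ret;
}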
// SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> * (C) 2012 by Vyatta Inc.
<http://www.vyatta.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/rculist.h> #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static unsigned int nfct_timeout_id __read_mostly; struct ctnl_timeout { struct list_head head; struct list_head free_head; struct rcu_head rcu_head; refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; /* must be at the end */ struct nf_ct_timeout timeout; }; struct nfct_timeout_pernet { struct list_head nfct_timeout_list; struct list_head nfct_timeout_freelist; }; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING, .len = CTNL_TIMEOUT_NAME_MAX - 1}, [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 }, [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 }, [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, }; static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net) { return net_generic(net, nfct_timeout_id); } static int ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout); err: kfree(tb); return ret; } static int cttimeout_new_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout, *matching = NULL; char *name; int ret; if (!cda[CTA_TIMEOUT_NAME] || !cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = timeout; break; } if (matching) { if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. 
*/ if (matching->timeout.l3num != l3num || matching->timeout.l4proto->l4proto != l4num) return -EINVAL; return ctnl_timeout_parse_policy(&matching->timeout.data, matching->timeout.l4proto, info->net, cda[CTA_TIMEOUT_DATA]); } return -EBUSY; } l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supportted, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct ctnl_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); timeout->timeout.l3num = l3num; timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); __module_get(THIS_MODULE); list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list); return 0; err: kfree(timeout); err_proto_put: return ret; } static int ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto) || nla_put_be32(skb, CTA_TIMEOUT_USE, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nfct_timeout_pernet *pernet; struct net *net = sock_net(skb->sk); struct ctnl_timeout *cur, *last; if (cb->args[2]) return 0; last = (struct ctnl_timeout *)cb->args[1]; if (cb->args[1]) cb->args[1] = 0; rcu_read_lock(); pernet = nfct_timeout_pernet(net); list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) { if (last) { if (cur != last) continue; last = NULL; } if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; break; } } if (!cb->args[1]) cb->args[2] = 1; rcu_read_unlock(); return skb->len; } static int cttimeout_get_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnl_timeout_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!cda[CTA_TIMEOUT_NAME]) return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; 
break; } ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } /* try to delete object, fail if it is still in use. */ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) { int ret = 0; /* We want to avoid races with ctnl_timeout_put. So only when the * current refcnt is 1, we decrease it to 0. */ if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_untimeout(net, &timeout->timeout); kfree_rcu(timeout, rcu_head); } else { ret = -EBUSY; } return ret; } static int cttimeout_del_timeout(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) ctnl_timeout_try_del(info->net, cur); return 0; } name = nla_data(cda[CTA_TIMEOUT_NAME]); list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; ret = ctnl_timeout_try_del(info->net, cur); if (ret < 0) return ret; break; } return ret; } static int cttimeout_default_set(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO] || !cda[CTA_TIMEOUT_DATA]) return -EINVAL; l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err; } ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; return 0; err: return ret; } static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, u16 l3num, const struct nf_conntrack_l4proto *l4proto, const unsigned int *timeouts) { struct nlmsghdr *nlh; unsigned int flags = portid ? 
NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA); if (!nest_parms) goto nla_put_failure; ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts); if (ret < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int cttimeout_default_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; __u16 l3num; __u8 l4num; int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) return -EOPNOTSUPP; switch (l4proto->l4proto) { case IPPROTO_ICMP: timeouts = &nf_icmp_pernet(info->net)->timeout; break; case IPPROTO_TCP: timeouts = nf_tcp_pernet(info->net)->timeouts; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_DCCP: #ifdef CONFIG_NF_CT_PROTO_DCCP timeouts = nf_dccp_pernet(info->net)->dccp_timeout; #endif break; case IPPROTO_ICMPV6: timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP timeouts = nf_sctp_pernet(info->net)->timeouts; #endif break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE timeouts = nf_gre_pernet(info->net)->timeouts; #endif break; case 255: timeouts = &nf_generic_pernet(info->net)->timeout; break; default: WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); break; } if (!timeouts) return -EOPNOTSUPP; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) return -ENOMEM; ret = cttimeout_default_fill_info(info->net, skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, const char *name) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *timeout, *matching = NULL; list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; if (!refcount_inc_not_zero(&timeout->refcnt)) goto err; matching = timeout; break; } err: return matching ? 
&matching->timeout : NULL; } static void ctnl_timeout_put(struct nf_ct_timeout *t) { struct ctnl_timeout *timeout = container_of(t, struct ctnl_timeout, timeout); if (refcount_dec_and_test(&timeout->refcnt)) { kfree_rcu(timeout, rcu_head); module_put(THIS_MODULE); } } static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = { .call = cttimeout_default_set, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = { .call = cttimeout_default_get, .type = NFNL_CB_MUTEX, .attr_count = CTA_TIMEOUT_MAX, .policy = cttimeout_nla_policy }, }; static const struct nfnetlink_subsystem cttimeout_subsys = { .name = "conntrack_timeout", .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT, .cb_count = IPCTNL_MSG_TIMEOUT_MAX, .cb = cttimeout_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); static int __net_init cttimeout_net_init(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); INIT_LIST_HEAD(&pernet->nfct_timeout_list); INIT_LIST_HEAD(&pernet->nfct_timeout_freelist); return 0; } static void __net_exit cttimeout_net_pre_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { list_del_rcu(&cur->head); list_add(&cur->free_head, &pernet->nfct_timeout_freelist); } /* core calls synchronize_rcu() after this */ } static void __net_exit cttimeout_net_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; if (list_empty(&pernet->nfct_timeout_freelist)) return; nf_ct_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, free_head) { list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .pre_exit = cttimeout_net_pre_exit, .exit = cttimeout_net_exit, .id = &nfct_timeout_id, .size = sizeof(struct nfct_timeout_pernet), }; static const struct nf_ct_timeout_hooks hooks = { .timeout_find_get = ctnl_timeout_find_get, .timeout_put = ctnl_timeout_put, }; static int __init cttimeout_init(void) { int ret; ret = register_pernet_subsys(&cttimeout_ops); if (ret < 0) return ret; ret = nfnetlink_subsys_register(&cttimeout_subsys); if (ret < 0) { pr_err("cttimeout_init: cannot register cttimeout with " "nfnetlink.\n"); goto err_out; } RCU_INIT_POINTER(nf_ct_timeout_hook, &hooks); return 0; err_out: unregister_pernet_subsys(&cttimeout_ops); return ret; } static int untimeout(struct nf_conn *ct, void *timeout) { struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) RCU_INIT_POINTER(timeout_ext->timeout, NULL); return 0; } static void __exit cttimeout_exit(void) { nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); RCU_INIT_POINTER(nf_ct_timeout_hook, NULL); nf_ct_iterate_destroy(untimeout, NULL); } module_init(cttimeout_init); 
module_exit(cttimeout_exit);
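/*
 * Illustrative userspace sketch (assumes libmnl is available): building the
 * IPCTNL_MSG_TIMEOUT_NEW request that cttimeout_new_timeout() above parses.
 * Per cttimeout_nla_policy, the message carries a NUL-terminated name,
 * big-endian L3/L4 protocol numbers and a nested CTA_TIMEOUT_DATA block whose
 * contents depend on the L4 protocol (two TCP timeouts, in seconds and in
 * network byte order, are shown). Error handling is omitted and the function
 * name is hypothetical.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>

static int example_add_tcp_timeout_policy(const char *name)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct mnl_socket *nl;
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfg;
	struct nlattr *nest;

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = (NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8) |
			  IPCTNL_MSG_TIMEOUT_NEW;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = AF_UNSPEC;
	nfg->version = NFNETLINK_V0;
	nfg->res_id = 0;

	mnl_attr_put_str(nlh, CTA_TIMEOUT_NAME, name);
	mnl_attr_put_u16(nlh, CTA_TIMEOUT_L3PROTO, htons(AF_INET));
	mnl_attr_put_u8(nlh, CTA_TIMEOUT_L4PROTO, IPPROTO_TCP);

	/* Nested per-protocol data, parsed by ctnl_timeout_parse_policy(). */
	nest = mnl_attr_nest_start(nlh, CTA_TIMEOUT_DATA);
	mnl_attr_put_u32(nlh, CTA_TIMEOUT_TCP_ESTABLISHED, htonl(7200));
	mnl_attr_put_u32(nlh, CTA_TIMEOUT_TCP_CLOSE_WAIT, htonl(60));
	mnl_attr_nest_end(nlh, nest);

	nl = mnl_socket_open(NETLINK_NETFILTER);
	if (!nl)
		return -1;
	mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID);
	mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
	mnl_socket_close(nl);
	return 0;
}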
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H #include <asm/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) struct module; #define NFT_JUMP_STACK_SIZE 16 enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { struct sk_buff *skb; const struct nf_hook_state *state; u8 flags; u8 tprot; u16 fragoff; u16 thoff; u16 inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) { return pkt->state->sk; } static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) { return pkt->thoff; } static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { pkt->skb = skb; pkt->state = state; } static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->flags = 0; pkt->tprot = 0; pkt->thoff = 0; pkt->fragoff = 0; } /** * struct nft_verdict - nf_tables verdict * * @code: nf_tables/netfilter verdict code * @chain: destination chain for NFT_JUMP/NFT_GOTO */ struct nft_verdict { u32 code; struct nft_chain *chain; }; struct nft_data { union { u32 data[4]; struct nft_verdict verdict; }; } __attribute__((aligned(__alignof__(u64)))); #define NFT_REG32_NUM 20 /** * struct nft_regs - nf_tables register set * * @data: data registers * @verdict: verdict register * * The first four data registers alias to the verdict register.
*/ struct nft_regs { union { u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; const struct nft_expr *last; }; /* Store/load a u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit * level. So for store instructions, pad the remaining part with zeros to avoid * garbage values. */ static inline void nft_reg_store8(u32 *dreg, u8 val) { *dreg = 0; *(u8 *)dreg = val; } static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } static inline void nft_reg_store16(u32 *dreg, u16 val) { *dreg = 0; *(u16 *)dreg = val; } static inline void nft_reg_store_be16(u32 *dreg, __be16 val) { nft_reg_store16(dreg, (__force __u16)val); } static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } static inline __be16 nft_reg_load_be16(const u32 *sreg) { return (__force __be16)nft_reg_load16(sreg); } static inline __be32 nft_reg_load_be32(const u32 *sreg) { return *(__force __be32 *)sreg; } static inline void nft_reg_store64(u64 *dreg, u64 val) { put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { if (len % NFT_REG32_SIZE) dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } /** * struct nft_ctx - nf_tables rule/set context * * @net: net namespace * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; u16 flags; u8 family; u8 level; bool report; }; enum nft_data_desc_flags { NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) { return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { return type == NFT_DATA_VERDICT ?
NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object * * @len: length of the data * @data: content * * The presence of user data is indicated in an object specific fashion, * so a length of zero can't occur and the value "len" indicates data * of length len + 1. */ struct nft_userdata { u8 len; unsigned char data[]; }; /* placeholder structure for opaque set element backend representation. */ struct nft_elem_priv { }; /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; struct nft_elem_priv *priv; }; static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) { return (void *)priv; } struct nft_set; struct nft_set_iter { u8 genmask; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); }; /** * struct nft_set_desc - description of set elements * * @ktype: key type * @klen: key length * @dtype: data type * @dlen: data length * @objtype: object type * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { u32 ktype; unsigned int klen; u32 dtype; unsigned int dlen; u32 objtype; unsigned int size; u32 policy; u32 gc_int; u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; }; /** * enum nft_set_class - performance class * * @NFT_SET_CLASS_O_1: constant, O(1) * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, NFT_SET_CLASS_O_LOG_N, NFT_SET_CLASS_O_N, }; /** * struct nft_set_estimate - estimation of memory and performance * characteristics * * @size: required memory * @lookup: lookup performance class * @space: memory class */ struct nft_set_estimate { u64 size; enum nft_set_class lookup; enum nft_set_class space; }; #define NFT_EXPR_MAXATTR 16 #define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ ALIGN(size, __alignof__(struct nft_expr))) /** * struct nft_expr - nf_tables expression * * @ops: expression ops * @data: expression private data */ struct nft_expr { const struct nft_expr_ops *ops; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) { return (void *)expr->data; } struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void 
nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); struct nft_set_ext; /** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set * @update: update an element if exists, add it if doesn't exist * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements * @commit: commit set elements * @abort: abort set elements * @privsize: function to return size of set private data * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster * and currently only used in the packet path. All the rest are slower, * control plane functions. */ struct nft_set_ops { bool (*lookup)(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, struct nft_elem_priv * (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **priv); void (*activate)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*flush)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); void (*commit)(struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_ctx *ctx, const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; /** * struct nft_set_type - nf_tables set type * * @ops: set ops for this type * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; u32 features; }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) struct nft_set_elem_expr { u8 size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; #define nft_setelem_expr_at(__elem_expr, __offset) \ ((struct nft_expr *)&__elem_expr->data[__offset]) #define 
nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ __size < (__elem_expr)->size; \ __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) #define NFT_SET_EXPR_MAX 2 /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal * @timeout: default timeout value in jiffies * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length * @num_exprs: numbers of exprs * @exprs: stateful expression * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; refcount_t refs; struct nft_table *table; possible_net_t net; char *name; u64 handle; u32 ktype; u32 dtype; u32 objtype; u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; u32 use; atomic_t nelems; u32 ndeact; u64 timeout; u32 gc_int; u16 policy; u16 udlen; unsigned char *udata; struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:13, dead:1, genmask:2; u8 klen; u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline bool nft_set_is_anonymous(const struct nft_set *set) { return set->flags & NFT_SET_ANONYMOUS; } static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; } static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; } static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask); struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { u32 gc_int = READ_ONCE(set->gc_int); return gc_int ? msecs_to_jiffies(gc_int) : HZ; } /** * struct nft_set_binding - nf_tables set binding * * @list: set bindings list node * @chain: chain containing the rule bound to the set * @flags: set action flags * * A set binding contains all information necessary for validation * of new elements added to a bound set. 
*/ struct nft_set_binding { struct list_head list; const struct nft_chain *chain; u32 flags; }; enum nft_trans_phase; void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; /** * struct nft_set_ext_type - set extension type * * @len: fixed part length of the extension * @align: alignment requirements of the extension */ struct nft_set_ext_type { u8 len; u8 align; }; extern const struct nft_set_ext_type nft_set_ext_types[]; /** * struct nft_set_ext_tmpl - set extension template * * @len: length of extension area * @offset: offsets of individual extension types * @ext_len: length of the expected extension (used to sanity check) */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; u8 ext_len[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * * @genmask: generation mask * @offset: offsets of individual extension types * @data: beginning of extension data */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { memset(tmpl, 0, sizeof(*tmpl)); tmpl->len = sizeof(struct nft_set_ext); } static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); if (tmpl->len > U8_MAX) return -EINVAL; tmpl->offset[id] = tmpl->len; tmpl->ext_len[id] = nft_set_ext_types[id].len + len; tmpl->len += tmpl->ext_len[id]; return 0; } static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, const struct nft_set_ext_tmpl *tmpl) { memcpy(ext->offset, tmpl->offset, sizeof(ext->offset)); } static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return ext && __nft_set_ext_exists(ext, id); } static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id) { return (void *)ext + ext->offset[id]; } static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY); } static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY_END); } static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return
nft_set_ext(ext, NFT_SET_EXT_DATA); } static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } static inline u64 *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline u64 *nft_set_ext_expiration(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPIRATION); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, u64 tstamp) { return nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION) && time_after_eq64(tstamp, *nft_set_ext_expiration(ext)); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, const struct nft_elem_priv *elem_priv) { return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number * @family: address family for AF-specific types * @flags: expression type flags */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; const struct nla_policy *policy; unsigned int maxattr; u8 family; u8 flags; }; #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 enum nft_trans_phase { NFT_TRANS_PREPARE, NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE }; struct nft_flow_rule; struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu * @destroy_clone: destruction clone 
function * @dump: function to dump parameters * @validate: validate expression, called during loop detection * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow * offload array * @offload_stats: function to synchronize hardware stats via updating the counter expression * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, const struct nft_expr *src); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; /** * struct nft_rule - nf_tables rule * * @list: used internally * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { struct list_head list; u64 handle:42, genmask:2, dlen:12, udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; } static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr) { return ((void *)expr) + expr->ops->size; } static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[rule->dlen]; } static inline bool nft_expr_more(const struct nft_rule *rule, const struct nft_expr *expr) { return expr != nft_expr_last(rule) && expr->ops; } static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase); void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; u32 size; if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) { expr->ops->eval(expr, regs, pkt); if (regs->verdict.code == NFT_BREAK) return; } } } /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of 
nft_expr_last() is always the same since it * can't assume that the dlen value wasn't changed within calls in the loop. */ #define nft_rule_for_each_expr(expr, last, rule) \ for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \ (expr) != (last); \ (expr) = nft_expr_next(expr)) #define NFT_CHAIN_POLICY_UNSET U8_MAX struct nft_rule_dp { u64 is_last:1, dlen:12, handle:42; /* for tracing */ unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; struct nft_rule_dp_last { struct nft_rule_dp end; /* end of nft_rule_blob marker */ struct rcu_head h; /* call_rcu head */ struct nft_rule_blob *blob; /* ptr to free via call_rcu */ const struct nft_chain *chain; /* for nftables tracing */ }; static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule) { return (void *)rule + sizeof(*rule) + rule->dlen; } struct nft_rule_blob { unsigned long size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_rule_dp)))); }; /** * struct nft_chain - nf_tables chain * * @blob_gen_0: rule blob pointer to the current generation * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain * @flags: bitmask of enum NFTA_CHAIN_FLAGS * @bound: bind or not * @genmask: generation mask * @name: name of the chain * @udlen: user data length * @udata: user data in the chain * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; u8 flags:5, bound:1, genmask:2; char *name; u16 udlen; u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, NFT_CHAIN_T_MAX }; /** * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions * @ops_register: base chain register function * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); static inline bool nft_chain_binding(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BINDING; } static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & 
NFT_CHAIN_BINDING) && chain->bound; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_ctx *ctx); struct nft_stats { u64 bytes; u64 pkts; struct u64_stats_sync syncp; }; struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; }; /** * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) { return container_of(chain, struct nft_base_chain, chain); } static inline bool nft_is_base_chain(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) { if (*use == UINT_MAX) return false; (*use)++; return true; } static inline void nft_use_dec(u32 *use) { WARN_ON_ONCE((*use)-- == 0); } /* For error and abort path: restore use counter to previous state. */ static inline void nft_use_inc_restore(u32 *use) { WARN_ON_ONCE(!nft_use_inc(use)); } #define nft_use_dec_restore nft_use_dec /** * struct nft_table - nf_tables table * * @list: used internally * @chains_ht: chains in the table * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table * @family:address family * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask * @nlpid: netlink port ID * @name: name of the table * @udlen: length of the user data * @udata: user data * @validate_state: internal, set when transaction adds jumps */ struct nft_table { struct list_head list; struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; struct list_head flowtables; u64 hgenerator; u64 handle; u32 use; u16 family:6, flags:8, genmask:2; u32 nlpid; char *name; u16 udlen; u8 *udata; u8 validate_state; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @rhlhead: nft_objname_ht node * 
@key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @udlen: length of user data * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * * @select_ops: function to select nft_object_ops * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); const struct nft_object_ops *ops; struct list_head list; u32 type; unsigned int maxattr; u8 family; struct module *owner; const struct nla_policy *policy; }; /** * struct nft_object_ops - stateful object operations * * @eval: stateful object evaluation function * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); void (*update)(struct nft_object *obj, struct nft_object *newobj); const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); #define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table * * @list: flow table list node in table list * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector */ struct nft_flowtable { struct list_head list; struct nft_table *table; char *name; int hooknum; int ops_len; u32 genmask:2; u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); void 
nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * * @trace: other struct members are initialised * @nf_trace: copy of skb->nf_trace before rule evaluation * @type: event type (enum nft_trace_types) * @skbid: hash of skb to be used as trace id * @packet_dumped: packet headers sent in a previous traceinfo message * @basechain: base chain currently processed */ struct nft_traceinfo { bool trace; bool nf_trace; bool packet_dumped; enum nft_trace_types type:8; u32 skbid; const struct nft_base_chain *basechain; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_chain *basechain); void nft_trace_notify(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_rule_dp *rule, struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" __stringify(type)) #if IS_ENABLED(CONFIG_NF_TABLES) /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations * they're active in. A set bit means they're inactive in the generation * represented by that bit. * * New objects start out as inactive in the current and active in the * next generation. When committing the ruleset the bitmask is cleared, * meaning they're active in all generations. When removing an object, * it is set inactive in the next generation. After committing the ruleset, * the objects are removed. */ static inline unsigned int nft_gencursor_next(const struct net *net) { return net->nft.gencursor + 1 == 1 ? 1 : 0; } static inline u8 nft_genmask_next(const struct net *net) { return 1 << nft_gencursor_next(net); } static inline u8 nft_genmask_cur(const struct net *net) { /* Use READ_ONCE() to prevent refetching the value for atomicity */ return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) /* * Generic transaction helpers */ /* Check if this object is currently active. */ #define nft_is_active(__net, __obj) \ (((__obj)->genmask & nft_genmask_cur(__net)) == 0) /* Check if this object is active in the next generation. */ #define nft_is_active_next(__net, __obj) \ (((__obj)->genmask & nft_genmask_next(__net)) == 0) /* This object becomes active in the next generation. */ #define nft_activate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_cur(__net) /* This object becomes inactive in the next generation. */ #define nft_deactivate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_next(__net) /* After committing the ruleset, clear the stale generation bit. 
*/ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @binding_list: list of objects with possible bindings * @msg_type: message type * @put_net: ctx->net needs to be put * @ctx: transaction context * @data: internal information related to the transaction */ struct nft_trans { struct list_head list; struct list_head binding_list; int msg_type; bool put_net; struct nft_ctx ctx; char data[]; }; struct nft_trans_rule { struct nft_rule *rule; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) #define nft_trans_flow_rule(trans) \ (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) #define nft_trans_rule_bound(trans) \ (((struct nft_trans_rule *)trans->data)->bound) struct nft_trans_set { struct nft_set *set; u32 set_id; u32 gc_int; u64 timeout; bool update; bool bound; u32 size; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) #define nft_trans_set_bound(trans) \ (((struct nft_trans_set *)trans->data)->bound) #define nft_trans_set_update(trans) \ (((struct nft_trans_set *)trans->data)->update) #define nft_trans_set_timeout(trans) \ (((struct nft_trans_set *)trans->data)->timeout) #define nft_trans_set_gc_int(trans) \ (((struct nft_trans_set *)trans->data)->gc_int) #define nft_trans_set_size(trans) \ (((struct nft_trans_set *)trans->data)->size) struct nft_trans_chain { struct nft_chain *chain; bool update; char *name; struct nft_stats __percpu *stats; u8 policy; bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; #define nft_trans_chain(trans) \ (((struct nft_trans_chain *)trans->data)->chain) #define nft_trans_chain_update(trans) \ (((struct nft_trans_chain *)trans->data)->update) #define nft_trans_chain_name(trans) \ (((struct nft_trans_chain *)trans->data)->name) #define nft_trans_chain_stats(trans) \ (((struct nft_trans_chain *)trans->data)->stats) #define nft_trans_chain_policy(trans) \ (((struct nft_trans_chain *)trans->data)->policy) #define nft_trans_chain_bound(trans) \ (((struct nft_trans_chain *)trans->data)->bound) #define nft_trans_chain_id(trans) \ (((struct 
nft_trans_chain *)trans->data)->chain_id) #define nft_trans_basechain(trans) \ (((struct nft_trans_chain *)trans->data)->basechain) #define nft_trans_chain_hooks(trans) \ (((struct nft_trans_chain *)trans->data)->hook_list) struct nft_trans_table { bool update; }; #define nft_trans_table_update(trans) \ (((struct nft_trans_table *)trans->data)->update) struct nft_trans_elem { struct nft_set *set; struct nft_elem_priv *elem_priv; bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem_priv(trans) \ (((struct nft_trans_elem *)trans->data)->elem_priv) #define nft_trans_elem_set_bound(trans) \ (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; struct nft_object *newobj; bool update; }; #define nft_trans_obj(trans) \ (((struct nft_trans_obj *)trans->data)->obj) #define nft_trans_obj_newobj(trans) \ (((struct nft_trans_obj *)trans->data)->newobj) #define nft_trans_obj_update(trans) \ (((struct nft_trans_obj *)trans->data)->update) struct nft_trans_flowtable { struct nft_flowtable *flowtable; bool update; struct list_head hook_list; u32 flags; }; #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) #define nft_trans_flowtable_update(trans) \ (((struct nft_trans_flowtable *)trans->data)->update) #define nft_trans_flowtable_hooks(trans) \ (((struct nft_trans_flowtable *)trans->data)->hook_list) #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) #define NFT_TRANS_GC_BATCHCOUNT 256 struct nft_trans_gc { struct list_head list; struct net *net; struct nft_set *set; u32 seq; u16 count; struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq); struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) 
{ return -ENOENT; } #endif struct nftables_pernet { struct list_head tables; struct list_head commit_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; u64 tstamp; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; }; extern unsigned int nf_tables_net_id; static inline struct nftables_pernet *nft_pernet(const struct net *net) { return net_generic(net, nf_tables_net_id); } static inline u64 nft_net_tstamp(const struct net *net) { return nft_pernet(net)->tstamp; } #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) { return expr->ops->reduce == NFT_REDUCE_READONLY; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); static inline bool nft_reg_track_cmp(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg) { return track->regs[dreg].selector && track->regs[dreg].selector->ops == expr->ops && track->regs[dreg].num_reg == 0; } #endif /* _NET_NF_TABLES_H */
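The nft_reg_store8()/nft_reg_store16() helpers in nf_tables.h treat each nf_tables register as a 32-bit slot and zero the whole slot before writing the narrower value, so that concatenated lookups never compare against leftover garbage in the upper bytes. Below is a minimal, self-contained userspace sketch of that behaviour; the demo_* names, the reg[] array and the test values are illustrative stand-ins, not kernel API.

/*
 * Hedged sketch: simplified userspace copies of the register store helpers,
 * only to illustrate the "zero the 32-bit slot, then write the narrow value"
 * rule documented above nft_reg_store8().
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void demo_reg_store8(uint32_t *dreg, uint8_t val)
{
	*dreg = 0;				/* clear the full 32-bit register slot */
	memcpy(dreg, &val, sizeof(val));	/* then write the low byte */
}

static void demo_reg_store16(uint32_t *dreg, uint16_t val)
{
	*dreg = 0;
	memcpy(dreg, &val, sizeof(val));
}

int main(void)
{
	uint32_t reg[4];			/* stand-in for struct nft_regs data[] */

	memset(reg, 0xff, sizeof(reg));		/* pretend the slots hold garbage */
	demo_reg_store8(&reg[0], 0x2a);
	demo_reg_store16(&reg[1], 0x1234);

	/* Both slots now compare clean against a zero-padded lookup key. */
	printf("reg[0]=%#x reg[1]=%#x\n", reg[0], reg[1]);
	return 0;
}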
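The set extension template (struct nft_set_ext_tmpl with nft_set_ext_prepare() and nft_set_ext_add_length()) works like a small layout builder: each add aligns the running length, records the offset for that extension ID, and grows the template, and nft_set_ext() later turns a recorded offset back into a pointer. A hedged, self-contained sketch of the same offset arithmetic follows; the header size, extension sizes and alignments used here are made up for illustration and are not the kernel's real values.

/*
 * Hedged sketch of the nft_set_ext_tmpl offset bookkeeping: align the running
 * length, remember where the extension starts, then grow by its size.
 */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define DEMO_EXT_NUM	2

struct demo_tmpl {
	uint16_t len;
	uint8_t offset[DEMO_EXT_NUM];
};

static int demo_tmpl_add(struct demo_tmpl *t, unsigned int id,
			 unsigned int size, unsigned int align)
{
	t->len = ALIGN_UP(t->len, align);
	if (t->len > UINT8_MAX)
		return -1;		/* offsets are stored in a u8 */
	t->offset[id] = t->len;
	t->len += size;
	return 0;
}

int main(void)
{
	/* Pretend the fixed extension header is 2 bytes, then add a
	 * 16-byte u64-aligned key and a 1-byte flags extension. */
	struct demo_tmpl t = { .len = 2 };

	demo_tmpl_add(&t, 0, 16, 8);	/* key: offset aligned up from 2 to 8 */
	demo_tmpl_add(&t, 1, 1, 1);	/* flags: appended right after, at 24 */

	printf("key at %u, flags at %u, total %u bytes\n",
	       t.offset[0], t.offset[1], t.len);
	return 0;
}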
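The generation handling described in the gencursor comment (nft_genmask_cur()/nft_genmask_next() together with the nft_is_active*, nft_activate_next and nft_clear helpers) can also be exercised with plain integers. The sketch below assumes, as nft_gencursor_next() does, that the cursor only flips between 0 and 1; the demo_-prefixed names are illustrative, not kernel symbols, and the commit sequence shown is a simplified reading of the comment, not the full commit path.

/*
 * Hedged sketch of the two-generation bitmask scheme: a set bit in
 * obj->genmask means "inactive in that generation".
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int demo_gencursor;	/* stand-in for net->nft.gencursor */

static unsigned int demo_gencursor_next(void)
{
	return demo_gencursor + 1 == 1 ? 1 : 0;
}

static uint8_t demo_genmask_cur(void)  { return 1 << demo_gencursor; }
static uint8_t demo_genmask_next(void) { return 1 << demo_gencursor_next(); }

struct demo_obj { uint8_t genmask; };

static bool demo_is_active(const struct demo_obj *o)
{
	return (o->genmask & demo_genmask_cur()) == 0;
}

static bool demo_is_active_next(const struct demo_obj *o)
{
	return (o->genmask & demo_genmask_next()) == 0;
}

int main(void)
{
	/* New object: inactive in the current, active in the next generation,
	 * mirroring nft_activate_next() setting genmask to the current bit. */
	struct demo_obj o = { .genmask = demo_genmask_cur() };

	printf("before commit: active=%d active_next=%d\n",
	       demo_is_active(&o), demo_is_active_next(&o));

	/* Commit: flip the cursor, then clear the stale bit (nft_clear()). */
	demo_gencursor = demo_gencursor_next();
	o.genmask &= ~demo_genmask_next();

	printf("after commit:  active=%d active_next=%d\n",
	       demo_is_active(&o), demo_is_active_next(&o));
	return 0;
}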
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TRACE_EVENT_H #define _LINUX_TRACE_EVENT_H #include <linux/ring_buffer.h> #include <linux/trace_seq.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/perf_event.h> #include <linux/tracepoint.h> struct trace_array; struct array_buffer; struct tracer; struct dentry; struct bpf_prog; union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array); #if BITS_PER_LONG == 32 const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, const struct trace_print_flags_u64 *flag_array); const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, const struct trace_print_flags_u64 *symbol_array); #endif const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, unsigned int bitmask_size); const char *trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len, bool concatenate); const char *trace_print_array_seq(struct trace_seq *p, const void *buf, int count, size_t el_size); const char * trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); struct trace_iterator; struct trace_event; int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *event); extern __printf(2, 3) void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...); /* Used to find the offset and length of dynamic fields in trace events */ struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN u16 len; u16 offset; #else u16 offset; u16 len; #endif } __packed; /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: * * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; }; #define TRACE_EVENT_TYPE_MAX \ ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: */ struct trace_iterator { struct trace_array *tr; struct tracer *trace; struct array_buffer *array_buffer; void *private; int cpu_file; struct mutex mutex; struct ring_buffer_iter **buffer_iter; unsigned long iter_flags; void *temp; /* temp holder */ unsigned int temp_size; char *fmt; /* modified format holder */ unsigned int fmt_size; long wait_index; /* trace_seq for __print_flags() and __print_symbolic() etc.
*/ struct trace_seq tmp_seq; cpumask_var_t started; /* it's true when current open file is snapshot */ bool snapshot; /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; unsigned long lost_events; int leftover; int ent_size; int cpu; u64 ts; loff_t pos; long idx; /* All new field here will be zeroed out in pipe_read */ }; enum trace_iter_flags { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, TRACE_FILE_TIME_IN_NS = 4, }; typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags, struct trace_event *event); struct trace_event_functions { trace_print_func trace; trace_print_func raw; trace_print_func hex; trace_print_func binary; }; struct trace_event { struct hlist_node node; int type; struct trace_event_functions *funcs; }; extern int register_trace_event(struct trace_event *event); extern int unregister_trace_event(struct trace_event *event); /* Return values for print_line callback */ enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ TRACE_TYPE_HANDLED = 1, TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; enum print_line_t trace_handle_return(struct trace_seq *s); static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, unsigned int trace_ctx) { entry->preempt_count = trace_ctx & 0xff; entry->pid = current->pid; entry->type = type; entry->flags = trace_ctx >> 16; } unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_IRQS_NOSUPPORT = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, TRACE_FLAG_BH_OFF = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { unsigned int irq_status = irqs_disabled_flags(irqflags) ? TRACE_FLAG_IRQS_OFF : 0; return tracing_gen_ctx_irq_test(irq_status); } static inline unsigned int tracing_gen_ctx(void) { unsigned long irqflags; local_save_flags(irqflags); return tracing_gen_ctx_flags(irqflags); } #else static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) { return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); } static inline unsigned int tracing_gen_ctx(void) { return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); } #endif static inline unsigned int tracing_gen_ctx_dec(void) { unsigned int trace_ctx; trace_ctx = tracing_gen_ctx(); /* * Subtract one from the preemption counter if preemption is enabled, * see trace_event_buffer_reserve()for details. */ if (IS_ENABLED(CONFIG_PREEMPTION)) trace_ctx--; return trace_ctx; } struct trace_event_file; struct ring_buffer_event * trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer, struct trace_event_file *trace_file, int type, unsigned long len, unsigned int trace_ctx); #define TRACE_RECORD_CMDLINE BIT(0) #define TRACE_RECORD_TGID BIT(1) void tracing_record_taskinfo(struct task_struct *task, int flags); void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags); void tracing_record_cmdline(struct task_struct *task); void tracing_record_tgid(struct task_struct *task); int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 
__printf(3, 4); struct event_filter; enum trace_reg { TRACE_REG_REGISTER, TRACE_REG_UNREGISTER, #ifdef CONFIG_PERF_EVENTS TRACE_REG_PERF_REGISTER, TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, /* * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a * custom action was taken and the default action is not to be * performed. */ TRACE_REG_PERF_ADD, TRACE_REG_PERF_DEL, #endif }; struct trace_event_call; #define TRACE_FUNCTION_TYPE ((const char *)~0UL) struct trace_event_fields { const char *type; union { struct { const char *name; const int size; const int align; const int is_signed; const int filter_type; const int len; }; int (*define_fields)(struct trace_event_call *); }; }; struct trace_event_class { const char *system; void *probe; #ifdef CONFIG_PERF_EVENTS void *perf_probe; #endif int (*reg)(struct trace_event_call *event, enum trace_reg type, void *data); struct trace_event_fields *fields_array; struct list_head *(*get_fields)(struct trace_event_call *); struct list_head fields; int (*raw_init)(struct trace_event_call *); }; extern int trace_event_reg(struct trace_event_call *event, enum trace_reg type, void *data); struct trace_event_buffer { struct trace_buffer *buffer; struct ring_buffer_event *event; struct trace_event_file *trace_file; void *entry; unsigned int trace_ctx; struct pt_regs *regs; }; void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, struct trace_event_file *trace_file, unsigned long len); void trace_event_buffer_commit(struct trace_event_buffer *fbuffer); enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_DYNAMIC_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, }; /* * Event flags: * FILTERED - The event has a filter attached * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file * TRACEPOINT - Event is a tracepoint * DYNAMIC - Event is a dynamic event (created at run time) * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe * EPROBE - Event is an event probe * FPROBE - Event is an function probe * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. 
*/ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_DYNAMIC = (1 << TRACE_EVENT_FL_DYNAMIC_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) struct trace_event_call { struct list_head list; struct trace_event_class *class; union { char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; struct trace_event event; char *print_fmt; struct event_filter *filter; /* * Static events can disappear with modules, * where as dynamic ones need their own ref count. */ union { void *module; atomic_t refcnt; }; void *data; /* See the TRACE_EVENT_FL_* flags above */ int flags; /* static flags of different events */ #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog_array __rcu *prog_array; int (*perf_perm)(struct trace_event_call *, struct perf_event *); #endif }; #ifdef CONFIG_DYNAMIC_EVENTS bool trace_event_dyn_try_get_ref(struct trace_event_call *call); void trace_event_dyn_put_ref(struct trace_event_call *call); bool trace_event_dyn_busy(struct trace_event_call *call); #else static inline bool trace_event_dyn_try_get_ref(struct trace_event_call *call) { /* Without DYNAMIC_EVENTS configured, nothing should be calling this */ return false; } static inline void trace_event_dyn_put_ref(struct trace_event_call *call) { } static inline bool trace_event_dyn_busy(struct trace_event_call *call) { /* Nothing should call this without DYNAIMIC_EVENTS configured. */ return true; } #endif static inline bool trace_event_try_get_ref(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_DYNAMIC) return trace_event_dyn_try_get_ref(call); else return try_module_get(call->module); } static inline void trace_event_put_ref(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_DYNAMIC) trace_event_dyn_put_ref(call); else module_put(call->module); } #ifdef CONFIG_PERF_EVENTS static inline bool bpf_prog_array_valid(struct trace_event_call *call) { /* * This inline function checks whether call->prog_array * is valid or not. The function is called in various places, * outside rcu_read_lock/unlock, as a heuristic to speed up execution. * * If this function returns true, and later call->prog_array * becomes false inside rcu_read_lock/unlock region, * we bail out then. If this function return false, * there is a risk that we might miss a few events if the checking * were delayed until inside rcu_read_lock/unlock region and * call->prog_array happened to become non-NULL then. * * Here, READ_ONCE() is used instead of rcu_access_pointer(). * rcu_access_pointer() requires the actual definition of * "struct bpf_prog_array" while READ_ONCE() only needs * a declaration of the same type. 
*/ return !!READ_ONCE(call->prog_array); } #endif static inline const char * trace_event_name(struct trace_event_call *call) { if (call->flags & TRACE_EVENT_FL_CUSTOM) return call->name; else if (call->flags & TRACE_EVENT_FL_TRACEPOINT) return call->tp ? call->tp->name : NULL; else return call->name; } static inline struct list_head * trace_get_fields(struct trace_event_call *event_call) { if (!event_call->class->get_fields) return &event_call->class->fields; return event_call->class->get_fields(event_call); } struct trace_subsystem_dir; enum { EVENT_FILE_FL_ENABLED_BIT, EVENT_FILE_FL_RECORDED_CMD_BIT, EVENT_FILE_FL_RECORDED_TGID_BIT, EVENT_FILE_FL_FILTERED_BIT, EVENT_FILE_FL_NO_SET_FILTER_BIT, EVENT_FILE_FL_SOFT_MODE_BIT, EVENT_FILE_FL_SOFT_DISABLED_BIT, EVENT_FILE_FL_TRIGGER_MODE_BIT, EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, EVENT_FILE_FL_WAS_ENABLED_BIT, EVENT_FILE_FL_FREED_BIT, }; extern struct trace_event_file *trace_get_event_file(const char *instance, const char *system, const char *event); extern void trace_put_event_file(struct trace_event_file *file); #define MAX_DYNEVENT_CMD_LEN (2048) enum dynevent_type { DYNEVENT_TYPE_SYNTH = 1, DYNEVENT_TYPE_KPROBE, DYNEVENT_TYPE_NONE, }; struct dynevent_cmd; typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd); struct dynevent_cmd { struct seq_buf seq; const char *event_name; unsigned int n_fields; enum dynevent_type type; dynevent_create_fn_t run_command; void *private_data; }; extern int dynevent_create(struct dynevent_cmd *cmd); extern int synth_event_delete(const char *name); extern void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen); extern int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, struct module *mod, ...); #define synth_event_gen_cmd_start(cmd, name, mod, ...) 
\ __synth_event_gen_cmd_start(cmd, name, mod, ## __VA_ARGS__, NULL) struct synth_field_desc { const char *type; const char *name; }; extern int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, struct module *mod, struct synth_field_desc *fields, unsigned int n_fields); extern int synth_event_create(const char *name, struct synth_field_desc *fields, unsigned int n_fields, struct module *mod); extern int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, const char *name); extern int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name); extern int synth_event_add_fields(struct dynevent_cmd *cmd, struct synth_field_desc *fields, unsigned int n_fields); #define synth_event_gen_cmd_end(cmd) \ dynevent_create(cmd) struct synth_event; struct synth_event_trace_state { struct trace_event_buffer fbuffer; struct synth_trace_event *entry; struct trace_buffer *buffer; struct synth_event *event; unsigned int cur_field; unsigned int n_u64; bool disabled; bool add_next; bool add_name; }; extern int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...); extern int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals); extern int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state); extern int synth_event_add_next_val(u64 val, struct synth_event_trace_state *trace_state); extern int synth_event_add_val(const char *field_name, u64 val, struct synth_event_trace_state *trace_state); extern int synth_event_trace_end(struct synth_event_trace_state *trace_state); extern int kprobe_event_delete(const char *name); extern void kprobe_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen); #define kprobe_event_gen_cmd_start(cmd, name, loc, ...) \ __kprobe_event_gen_cmd_start(cmd, false, name, loc, ## __VA_ARGS__, NULL) #define kretprobe_event_gen_cmd_start(cmd, name, loc, ...) \ __kprobe_event_gen_cmd_start(cmd, true, name, loc, ## __VA_ARGS__, NULL) extern int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, const char *name, const char *loc, ...); #define kprobe_event_add_fields(cmd, ...) 
\ __kprobe_event_add_fields(cmd, ## __VA_ARGS__, NULL) #define kprobe_event_add_field(cmd, field) \ __kprobe_event_add_fields(cmd, field, NULL) extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...); #define kprobe_event_gen_cmd_end(cmd) \ dynevent_create(cmd) #define kretprobe_event_gen_cmd_end(cmd) \ dynevent_create(cmd) /* * Event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch * RECORDED_TGID - The tgids should be recorded at sched_switch * FILTERED - The event has a filter attached * NO_SET_FILTER - Set when filter has error and is to be ignored * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED * SOFT_DISABLED - When set, do not trace the event (even though its * tracepoint may be enabled) * TRIGGER_MODE - When set, invoke the triggers associated with the event * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid * WAS_ENABLED - Set when enabled to know to clear trace on module removal * FREED - File descriptor is freed, all fields should be considered invalid */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), EVENT_FILE_FL_RECORDED_CMD = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT), EVENT_FILE_FL_RECORDED_TGID = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT), EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT), EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT), EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), EVENT_FILE_FL_FREED = (1 << EVENT_FILE_FL_FREED_BIT), }; struct trace_event_file { struct list_head list; struct trace_event_call *event_call; struct event_filter __rcu *filter; struct eventfs_inode *ei; struct trace_array *tr; struct trace_subsystem_dir *system; struct list_head triggers; /* * 32 bit flags: * bit 0: enabled * bit 1: enabled cmd record * bit 2: enable/disable with the soft disable bit * bit 3: soft disabled * bit 4: trigger enabled * * Note: The bits must be set atomically to prevent races * from other writers. Reads of flags do not need to be in * sync as they occur in critical sections. But the way flags * is currently used, these changes do not affect the code * except that when a change is made, it may have a slight * delay in propagating the changes to other CPUs due to * caching and such. Which is mostly OK ;-) */ unsigned long flags; atomic_t ref; /* ref count for opened files */ atomic_t sm_ref; /* soft-mode reference counter */ atomic_t tm_ref; /* trigger-mode reference counter */ }; #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ event_##name.flags |= value; \ return 0; \ } \ early_initcall(trace_init_flags_##name); #define __TRACE_EVENT_PERF_PERM(name, expr...) 
\ static int perf_perm_##name(struct trace_event_call *tp_event, \ struct perf_event *p_event) \ { \ return ({ expr; }); \ } \ static int __init trace_init_perf_perm_##name(void) \ { \ event_##name.perf_perm = &perf_perm_##name; \ return 0; \ } \ early_initcall(trace_init_perf_perm_##name); #define PERF_MAX_TRACE_SIZE 8192 #define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), ETT_TRACE_ONOFF = (1 << 0), ETT_SNAPSHOT = (1 << 1), ETT_STACKTRACE = (1 << 2), ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), ETT_HIST_ENABLE = (1 << 5), ETT_EVENT_EPROBE = (1 << 6), }; extern int filter_match_preds(struct event_filter *filter, void *rec); extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event); extern void event_triggers_post_call(struct trace_event_file *file, enum event_trigger_type tt); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); bool __trace_trigger_soft_disabled(struct trace_event_file *file); /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test * * If any triggers without filters are attached to this event, they * will be called here. If the event is soft disabled and has no * triggers that require testing the fields, it will return true, * otherwise false. */ static __always_inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE | EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_PID_FILTER)))) return false; if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND)) return false; return __trace_trigger_soft_disabled(file); } #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr, unsigned long *missed); int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { return 1; } static inline int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { return -EOPNOTSUPP; } static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } static inline int perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) { return -EOPNOTSUPP; } static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) { return -EOPNOTSUPP; } static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } static inline void bpf_put_raw_tracepoint(struct 
bpf_raw_event_map *btp) { } static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr, unsigned long *missed) { return -EOPNOTSUPP; } static inline int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static inline int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } #endif enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_RDYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, FILTER_CPUMASK, FILTER_COMM, FILTER_CPU, FILTER_STACKTRACE, }; extern int trace_event_raw_init(struct trace_event_call *call); extern int trace_define_field(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type); extern int trace_add_event_call(struct trace_event_call *call); extern int trace_remove_event_call(struct trace_event_call *call); extern int trace_event_get_offsets(struct trace_event_call *call); int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); int trace_set_clr_event(const char *system, const char *event, int set); int trace_array_set_clr_event(struct trace_array *tr, const char *system, const char *event, bool enable); /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a * constant. Even with the outer if statement optimizing out. */ #define event_trace_printk(ip, fmt, args...) \ do { \ __trace_printk_check_format(fmt, ##args); \ tracing_record_cmdline(current); \ if (__builtin_constant_p(fmt)) { \ static const char *trace_printk_fmt \ __section("__trace_printk_fmt") = \ __builtin_constant_p(fmt) ? 
fmt : NULL; \ \ __trace_bprintk(ip, trace_printk_fmt, ##args); \ } else \ __trace_printk(ip, fmt, ##args); \ } while (0) #ifdef CONFIG_PERF_EVENTS struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); extern int perf_trace_add(struct perf_event *event, int flags); extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); extern int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, const char **symbol, u64 *probe_offset, u64 *probe_addr, unsigned long *missed, bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, unsigned long ref_ctr_offset, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); extern int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, u64 *probe_offset, u64 *probe_addr, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie); void perf_event_free_bpf_prog(struct perf_event *event); void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3); void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4); void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5); void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6); void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8); void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9); void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10); void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11); void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task); static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, struct task_struct *task) { perf_tp_event(type, count, raw_data, size, regs, head, rctx, task); } #endif #define TRACE_EVENT_STR_MAX 512 /* * gcc warns that you can not use a va_list in an inlined * function. 
But it lets us make it into a macro :-/
 */
#define __trace_event_vstr_len(fmt, va)				\
({								\
	va_list __ap;						\
	int __ret;						\
								\
	va_copy(__ap, *(va));					\
	__ret = vsnprintf(NULL, 0, fmt, __ap) + 1;		\
	va_end(__ap);						\
								\
	min(__ret, TRACE_EVENT_STR_MAX);			\
})

#endif /* _LINUX_TRACE_EVENT_H */

/*
 * Note: we keep the TRACE_CUSTOM_EVENT outside the include file ifdef
 * protection. This is due to the way trace custom events work. If a file
 * includes two trace event headers under one "CREATE_CUSTOM_TRACE_EVENTS"
 * the first include will override the TRACE_CUSTOM_EVENT and break the
 * second include.
 */
#ifndef TRACE_CUSTOM_EVENT
#define DECLARE_CUSTOM_EVENT_CLASS(name, proto, args, tstruct, assign, print)
#define DEFINE_CUSTOM_EVENT(template, name, proto, args)
#define TRACE_CUSTOM_EVENT(name, proto, args, struct, assign, print)
#endif /* ifdef TRACE_CUSTOM_EVENT (see note above) */
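/*
 * A minimal sketch of how the dynevent command helpers declared above can be
 * chained from module code to create a kprobe event at run time, in the
 * spirit of the in-tree generation tests. The event name "sketch_kprobe",
 * the probed symbol "do_sys_openat2" and the fetch arguments are illustrative
 * assumptions only, not definitions taken from this header.
 */
static int __maybe_unused sketch_gen_kprobe_event(void)
{
	static char buf[MAX_DYNEVENT_CMD_LEN];
	struct dynevent_cmd cmd;
	int ret;

	/* Bind the command to a buffer big enough for the whole command */
	kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);

	/* Start a "p:" style command: event name, probe location, fetch args */
	ret = kprobe_event_gen_cmd_start(&cmd, "sketch_kprobe",
					 "do_sys_openat2",
					 "dfd=%di", "filename=%si");
	if (ret)
		return ret;

	/* More fetch arguments may be appended before the command is run */
	ret = kprobe_event_add_fields(&cmd, "flags=%dx");
	if (ret)
		return ret;

	/* Create the dynamic event (invokes cmd->run_command) */
	return kprobe_event_gen_cmd_end(&cmd);
}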
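/*
 * The io_uring networking code that follows (io_uring/net.c) implements
 * opcodes such as IORING_OP_SEND, IORING_OP_RECV and their zero-copy and
 * multishot variants, which userspace normally queues through liburing.
 * Below is a minimal, hypothetical userspace sketch of driving the send
 * path; "sockfd" is assumed to be an already-connected socket and error
 * handling is trimmed for brevity. It is illustration only, not part of
 * net.c itself.
 */
#include <liburing.h>
#include <string.h>

static int send_one(int sockfd, const char *msg)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int res;

	io_uring_queue_init(8, &ring, 0);

	/* Queue one IORING_OP_SEND; serviced by io_send() in net.c below */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_send(sqe, sockfd, msg, strlen(msg), 0);

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	res = cqe->res;		/* bytes sent, or -errno */
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return res;
}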
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> 
#include <linux/slab.h> #include <linux/net.h> #include <linux/compat.h> #include <net/compat.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "kbuf.h" #include "alloc_cache.h" #include "net.h" #include "notif.h" #include "rsrc.h" #if defined(CONFIG_NET) struct io_shutdown { struct file *file; int how; }; struct io_accept { struct file *file; struct sockaddr __user *addr; int __user *addr_len; int flags; u32 file_slot; unsigned long nofile; }; struct io_socket { struct file *file; int domain; int type; int protocol; int flags; u32 file_slot; unsigned long nofile; }; struct io_connect { struct file *file; struct sockaddr __user *addr; int addr_len; bool in_progress; bool seen_econnaborted; }; struct io_sr_msg { struct file *file; union { struct compat_msghdr __user *umsg_compat; struct user_msghdr __user *umsg; void __user *buf; }; unsigned len; unsigned done_io; unsigned msg_flags; unsigned nr_multishot_loops; u16 flags; /* initialised and used only by !msg send variants */ u16 addr_len; u16 buf_group; void __user *addr; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; /* * Number of times we'll try and do receives if there's more data. If we * exceed this limit, then add us to the back of the queue and retry from * there. This helps fairness between flooding clients. */ #define MULTISHOT_MAX_RETRY 32 static inline bool io_check_multishot(struct io_kiocb *req, unsigned int issue_flags) { /* * When ->locked_cq is set we only allow to post CQEs from the original * task context. Usual request completions will be handled in other * generic paths but multipoll may decide to post extra cqes. */ return !(issue_flags & IO_URING_F_IOWQ) || !(issue_flags & IO_URING_F_MULTISHOT) || !req->ctx->task_complete; } int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; shutdown->how = READ_ONCE(sqe->len); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); struct socket *sock; int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, shutdown->how); io_req_set_res(req, ret, 0); return IOU_OK; } static bool io_net_retry(struct socket *sock, int flags) { if (!(flags & MSG_WAITALL)) return false; return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED) return; /* Let normal cleanup path reap it if we fail adding to the cache */ if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } } static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_cache_entry *entry; struct io_async_msghdr *hdr; if (!(issue_flags & IO_URING_F_UNLOCKED)) { entry = io_alloc_cache_get(&ctx->netmsg_cache); if (entry) { hdr = container_of(entry, struct io_async_msghdr, cache); hdr->free_iov = NULL; req->flags |= REQ_F_ASYNC_DATA; req->async_data = hdr; return hdr; } } 
if (!io_alloc_async_data(req)) { hdr = req->async_data; hdr->free_iov = NULL; return hdr; } return NULL; } static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) { /* ->prep_async is always called from the submission context */ return io_msg_alloc_async(req, 0); } static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg, unsigned int issue_flags) { struct io_async_msghdr *async_msg; if (req_has_async_data(req)) return -EAGAIN; async_msg = io_msg_alloc_async(req, issue_flags); if (!async_msg) { kfree(kmsg->free_iov); return -ENOMEM; } req->flags |= REQ_F_NEED_CLEANUP; memcpy(async_msg, kmsg, sizeof(*kmsg)); if (async_msg->msg.msg_name) async_msg->msg.msg_name = &async_msg->addr; if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) return -EAGAIN; /* if were using fast_iov, set it to the new one */ if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; } return -EAGAIN; } static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int ret; iomsg->msg.msg_name = &iomsg->addr; iomsg->free_iov = iomsg->fast_iov; ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, &iomsg->free_iov); /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; return ret; } int io_send_prep_async(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *io; int ret; if (!zc->addr || req_has_async_data(req)) return 0; io = io_msg_alloc_async_prep(req); if (!io) return -ENOMEM; ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); return ret; } static int io_setup_async_addr(struct io_kiocb *req, struct sockaddr_storage *addr_storage, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *io; if (!sr->addr || req_has_async_data(req)) return -EAGAIN; io = io_msg_alloc_async(req, issue_flags); if (!io) return -ENOMEM; memcpy(&io->addr, addr_storage, sizeof(io->addr)); return -EAGAIN; } int io_sendmsg_prep_async(struct io_kiocb *req) { int ret; if (!io_msg_alloc_async_prep(req)) return -ENOMEM; ret = io_sendmsg_copy_hdr(req, req->async_data); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) { struct io_async_msghdr *io = req->async_data; kfree(io->free_iov); } int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); if (req->opcode == IORING_OP_SEND) { if (READ_ONCE(sqe->__pad3[0])) return -EINVAL; sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sr->addr_len = READ_ONCE(sqe->addr_len); } else if (sqe->addr2 || sqe->file_index) { return -EINVAL; } sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; #ifdef CONFIG_COMPAT if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif sr->done_io = 0; return 0; } int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr iomsg, 
*kmsg; struct socket *sock; unsigned flags; int min_ret = 0; int ret; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (req_has_async_data(req)) { kmsg = req->async_data; kmsg->msg.msg_control_user = sr->msg_control; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; kmsg = &iomsg; } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return io_setup_async_msg(req, kmsg, issue_flags); if (ret > 0 && io_net_retry(sock, flags)) { kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; io_req_set_res(req, ret, 0); return IOU_OK; } int io_send(struct io_kiocb *req, unsigned int issue_flags) { struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; struct socket *sock; unsigned flags; int min_ret = 0; int ret; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; msg.msg_ubuf = NULL; if (sr->addr) { if (req_has_async_data(req)) { struct io_async_msghdr *io = req->async_data; msg.msg_name = &io->addr; } else { ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); if (unlikely(ret < 0)) return ret; msg.msg_name = (struct sockaddr *)&__address; } msg.msg_namelen = sr->addr_len; } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_addr(req, &__address, issue_flags); sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); if (unlikely(ret)) return ret; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg.msg_flags = flags; ret = sock_sendmsg(sock, &msg); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return io_setup_async_addr(req, &__address, issue_flags); if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_addr(req, &__address, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } if (ret >= 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; io_req_set_res(req, ret, 0); return IOU_OK; } static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) { int hdr; if (iomsg->namelen < 0) return true; if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out), iomsg->namelen, &hdr)) return true; if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr)) return true; return false; } static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct 
user_msghdr msg; int ret; if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg))) return -EFAULT; ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { if (msg.msg_iovlen == 0) { sr->len = iomsg->fast_iov[0].iov_len = 0; iomsg->fast_iov[0].iov_base = NULL; iomsg->free_iov = NULL; } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov))) return -EFAULT; sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } if (req->flags & REQ_F_APOLL_MULTISHOT) { iomsg->namelen = msg.msg_namelen; iomsg->controllen = msg.msg_controllen; if (io_recvmsg_multishot_overflow(iomsg)) return -EOVERFLOW; } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, false); if (ret > 0) ret = 0; } return ret; } #ifdef CONFIG_COMPAT static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct compat_msghdr msg; struct compat_iovec __user *uiov; int ret; if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) return -EFAULT; ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; uiov = compat_ptr(msg.msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; iomsg->free_iov = NULL; if (msg.msg_iovlen == 0) { sr->len = 0; } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(clen, &uiov->iov_len)) return -EFAULT; if (clen < 0) return -EINVAL; sr->len = clen; } if (req->flags & REQ_F_APOLL_MULTISHOT) { iomsg->namelen = msg.msg_namelen; iomsg->controllen = msg.msg_controllen; if (io_recvmsg_multishot_overflow(iomsg)) return -EOVERFLOW; } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, true); if (ret < 0) return ret; } return 0; } #endif static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_iter.nr_segs = 0; #ifdef CONFIG_COMPAT if (req->ctx->compat) return __io_compat_recvmsg_copy_hdr(req, iomsg); #endif return __io_recvmsg_copy_hdr(req, iomsg); } int io_recvmsg_prep_async(struct io_kiocb *req) { int ret; if (!io_msg_alloc_async_prep(req)) return -ENOMEM; ret = io_recvmsg_copy_hdr(req, req->async_data); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~(RECVMSG_FLAGS)) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) return -EINVAL; if (req->opcode == IORING_OP_RECV && sr->len) return -EINVAL; req->flags |= REQ_F_APOLL_MULTISHOT; /* * Store the buffer group for this multishot receive 
separately, * as if we end up doing an io-wq based issue that selects a * buffer, it has to be committed immediately and that will * clear ->buf_list. This means we lose the link to the buffer * list, and the eventual buffer put on completion then cannot * restore it. */ sr->buf_group = req->buf_index; } #ifdef CONFIG_COMPAT if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif sr->done_io = 0; sr->nr_multishot_loops = 0; return 0; } static inline void io_recv_prep_retry(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; sr->len = 0; /* get from the provided buffer */ req->buf_index = sr->buf_group; } /* * Finishes io_recv and io_recvmsg. * * Returns true if it is actually finished, or false if it should run * again (for multishot). */ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, struct msghdr *msg, bool mshot_finished, unsigned issue_flags) { unsigned int cflags; cflags = io_put_kbuf(req, issue_flags); if (msg->msg_inq && msg->msg_inq != -1) cflags |= IORING_CQE_F_SOCK_NONEMPTY; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, *ret, cflags); *ret = IOU_OK; return true; } if (mshot_finished) goto finish; /* * Fill CQE for this receive and see if we should keep trying to * receive from this socket. */ if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, *ret, cflags | IORING_CQE_F_MORE)) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) return false; /* mshot retries exceeded, force a requeue */ sr->nr_multishot_loops = 0; mshot_retry_ret = IOU_REQUEUE; } if (issue_flags & IO_URING_F_MULTISHOT) *ret = mshot_retry_ret; else *ret = -EAGAIN; return true; } /* Otherwise stop multishot but use the current result. 
*/ finish: io_req_set_res(req, *ret, cflags); if (issue_flags & IO_URING_F_MULTISHOT) *ret = IOU_STOP_MULTISHOT; else *ret = IOU_OK; return true; } static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg, struct io_sr_msg *sr, void __user **buf, size_t *len) { unsigned long ubuf = (unsigned long) *buf; unsigned long hdr; hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + kmsg->controllen; if (*len < hdr) return -EFAULT; if (kmsg->controllen) { unsigned long control = ubuf + hdr - kmsg->controllen; kmsg->msg.msg_control_user = (void __user *) control; kmsg->msg.msg_controllen = kmsg->controllen; } sr->buf = *buf; /* stash for later copy */ *buf = (void __user *) (ubuf + hdr); kmsg->payloadlen = *len = *len - hdr; return 0; } struct io_recvmsg_multishot_hdr { struct io_uring_recvmsg_out msg; struct sockaddr_storage addr; }; static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, struct io_async_msghdr *kmsg, unsigned int flags, bool *finished) { int err; int copy_len; struct io_recvmsg_multishot_hdr hdr; if (kmsg->namelen) kmsg->msg.msg_name = &hdr.addr; kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); kmsg->msg.msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &kmsg->msg, flags); *finished = err <= 0; if (err < 0) return err; hdr.msg = (struct io_uring_recvmsg_out) { .controllen = kmsg->controllen - kmsg->msg.msg_controllen, .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT }; hdr.msg.payloadlen = err; if (err > kmsg->payloadlen) err = kmsg->payloadlen; copy_len = sizeof(struct io_uring_recvmsg_out); if (kmsg->msg.msg_namelen > kmsg->namelen) copy_len += kmsg->namelen; else copy_len += kmsg->msg.msg_namelen; /* * "fromlen shall refer to the value before truncation.." 
* 1003.1g */ hdr.msg.namelen = kmsg->msg.msg_namelen; /* ensure that there is no gap between hdr and sockaddr_storage */ BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) != sizeof(struct io_uring_recvmsg_out)); if (copy_to_user(io->buf, &hdr, copy_len)) { *finished = true; return -EFAULT; } return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + kmsg->controllen + err; } int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; bool mshot_finished = true; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (req_has_async_data(req)) { kmsg = req->async_data; } else { ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; kmsg = &iomsg; } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); if (!io_check_multishot(req, issue_flags)) return io_setup_async_msg(req, kmsg, issue_flags); retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; size_t len = sr->len; buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); if (ret) { io_kbuf_recycle(req, issue_flags); return ret; } } iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); } flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_inq = -1; if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_multishot(sock, sr, kmsg, flags, &mshot_finished); } else { /* disable partial retry for recvmsg with cmsg attached */ if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) min_ret = iov_iter_count(&kmsg->msg.msg_iter); ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); } if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { ret = io_setup_async_msg(req, kmsg, issue_flags); if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } return ret; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { req_set_fail(req); } if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; else io_kbuf_recycle(req, issue_flags); if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) goto retry_multishot; if (mshot_finished) { /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); io_netmsg_recycle(req, issue_flags); req->flags &= ~REQ_F_NEED_CLEANUP; } return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; struct socket *sock; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; size_t len = sr->len; if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; if (!io_check_multishot(req, issue_flags)) return -EAGAIN; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_control = 
NULL; msg.msg_get_inq = 1; msg.msg_controllen = 0; msg.msg_iocb = NULL; msg.msg_ubuf = NULL; retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; sr->len = len; } ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); if (unlikely(ret)) goto out_free; msg.msg_inq = -1; msg.msg_flags = 0; flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); ret = sock_recvmsg(sock, &msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { if (issue_flags & IO_URING_F_MULTISHOT) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } return -EAGAIN; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; else io_kbuf_recycle(req, issue_flags); if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) goto retry_multishot; return ret; } void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *io; if (req_has_async_data(req)) { io = req->async_data; /* might be ->fast_iov if *msg_copy_hdr failed */ if (io->free_iov != io->fast_iov) kfree(io->free_iov); } if (zc->notif) { io_notif_flush(zc->notif); zc->notif = NULL; } } #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) #define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *notif; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ if (req->flags & REQ_F_CQE_SKIP) return -EINVAL; notif = zc->notif = io_alloc_notif(ctx); if (!notif) return -ENOMEM; notif->cqe.user_data = req->cqe.user_data; notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; req->flags |= REQ_F_NEED_CLEANUP; zc->flags = READ_ONCE(sqe->ioprio); if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { if (zc->flags & ~IO_ZC_FLAGS_VALID) return -EINVAL; if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { io_notif_set_extended(notif); io_notif_to_data(notif)->zc_report = true; } } if (zc->flags & IORING_RECVSEND_FIXED_BUF) { unsigned idx = READ_ONCE(sqe->buf_index); if (unlikely(idx >= ctx->nr_user_bufs)) return -EFAULT; idx = array_index_nospec(idx, ctx->nr_user_bufs); req->imu = READ_ONCE(ctx->user_bufs[idx]); io_req_set_rsrc_node(notif, ctx, 0); } if (req->opcode == IORING_OP_SEND_ZC) { if (READ_ONCE(sqe->__pad3[0])) return -EINVAL; zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); zc->addr_len = READ_ONCE(sqe->addr_len); } else { if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) return -EINVAL; } zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; zc->done_io = 0; #ifdef CONFIG_COMPAT if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; 
#endif return 0; } static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length) { skb_zcopy_downgrade_managed(skb); return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); } static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length) { struct skb_shared_info *shinfo = skb_shinfo(skb); int frag = shinfo->nr_frags; int ret = 0; struct bvec_iter bi; ssize_t copied = 0; unsigned long truesize = 0; if (!frag) shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; else if (unlikely(!skb_zcopy_managed(skb))) return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); bi.bi_size = min(from->count, length); bi.bi_bvec_done = from->iov_offset; bi.bi_idx = 0; while (bi.bi_size && frag < MAX_SKB_FRAGS) { struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi); copied += v.bv_len; truesize += PAGE_ALIGN(v.bv_len + v.bv_offset); __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page, v.bv_offset, v.bv_len); bvec_iter_advance_single(from->bvec, &bi, v.bv_len); } if (bi.bi_size) ret = -EMSGSIZE; shinfo->nr_frags = frag; from->bvec += bi.bi_idx; from->nr_segs -= bi.bi_idx; from->count -= copied; from->iov_offset = bi.bi_bvec_done; skb->data_len += copied; skb->len += copied; skb->truesize += truesize; if (sk && sk->sk_type == SOCK_STREAM) { sk_wmem_queued_add(sk, truesize); if (!skb_zcopy_pure(skb)) sk_mem_charge(sk, truesize); } else { refcount_add(truesize, &skb->sk->sk_wmem_alloc); } return ret; } int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) { struct sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; msg.msg_name = NULL; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; if (zc->addr) { if (req_has_async_data(req)) { struct io_async_msghdr *io = req->async_data; msg.msg_name = &io->addr; } else { ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); if (unlikely(ret < 0)) return ret; msg.msg_name = (struct sockaddr *)&__address; } msg.msg_namelen = zc->addr_len; } if (!(req->flags & REQ_F_POLLED) && (zc->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_addr(req, &__address, issue_flags); if (zc->flags & IORING_RECVSEND_FIXED_BUF) { ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu, (u64)(uintptr_t)zc->buf, zc->len); if (unlikely(ret)) return ret; msg.sg_from_iter = io_sg_from_iter; } else { io_notif_set_extended(zc->notif); ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); if (unlikely(ret)) return ret; ret = io_notif_account_mem(zc->notif, zc->len); if (unlikely(ret)) return ret; msg.sg_from_iter = io_sg_from_iter_iovec; } msg_flags = zc->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg.msg_flags = msg_flags; msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; ret = sock_sendmsg(sock, &msg); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return io_setup_async_addr(req, &__address, issue_flags); if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { zc->len -= ret; zc->buf += ret; zc->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_addr(req, &__address, issue_flags); } if (ret 
== -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } if (ret >= 0) ret += zc->done_io; else if (zc->done_io) ret = zc->done_io; /* * If we're in io-wq we can't rely on tw ordering guarantees, defer * flushing notif to io_send_zc_cleanup() */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); req->flags &= ~REQ_F_NEED_CLEANUP; } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; } int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned flags; int ret, min_ret = 0; io_notif_set_extended(sr->notif); sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; if (req_has_async_data(req)) { kmsg = req->async_data; } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; kmsg = &iomsg; } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); flags = sr->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return io_setup_async_msg(req, kmsg, issue_flags); if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) { kfree(kmsg->free_iov); kmsg->free_iov = NULL; } io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; /* * If we're in io-wq we can't rely on tw ordering guarantees, defer * flushing notif to io_send_zc_cleanup() */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); req->flags &= ~REQ_F_NEED_CLEANUP; } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; } void io_sendrecv_fail(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); if (req->flags & REQ_F_PARTIAL_IO) req->cqe.res = sr->done_io; if ((req->flags & REQ_F_NEED_CLEANUP) && (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC)) req->cqe.flags |= IORING_CQE_F_MORE; } int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); unsigned flags; if (sqe->len || sqe->buf_index) return -EINVAL; accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); flags = READ_ONCE(sqe->ioprio); if (flags & ~IORING_ACCEPT_MULTISHOT) return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); if (accept->file_slot) { if (accept->flags & SOCK_CLOEXEC) return -EINVAL; if (flags & IORING_ACCEPT_MULTISHOT && accept->file_slot != IORING_FILE_INDEX_ALLOC) return -EINVAL; } if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; if (flags & 
IORING_ACCEPT_MULTISHOT) req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; bool fixed = !!accept->file_slot; struct file *file; int ret, fd; if (!io_check_multishot(req, issue_flags)) return -EAGAIN; retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); if (unlikely(fd < 0)) return fd; } file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, accept->flags); if (IS_ERR(file)) { if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && force_nonblock) { /* * if it's multishot and polled, we don't need to * return EAGAIN to arm the poll infra since it * has already been done */ if (issue_flags & IO_URING_F_MULTISHOT) return IOU_ISSUE_SKIP_COMPLETE; return ret; } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } else if (!fixed) { fd_install(fd, file); ret = fd; } else { ret = io_fixed_fd_install(req, issue_flags, file, accept->file_slot); } if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, ret, 0); return IOU_OK; } if (ret < 0) return ret; if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, IORING_CQE_F_MORE)) goto retry; io_req_set_res(req, ret, 0); return IOU_STOP_MULTISHOT; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket); if (sqe->addr || sqe->rw_flags || sqe->buf_index) return -EINVAL; sock->domain = READ_ONCE(sqe->fd); sock->type = READ_ONCE(sqe->off); sock->protocol = READ_ONCE(sqe->len); sock->file_slot = READ_ONCE(sqe->file_index); sock->nofile = rlimit(RLIMIT_NOFILE); sock->flags = sock->type & ~SOCK_TYPE_MASK; if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) return -EINVAL; if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; return 0; } int io_socket(struct io_kiocb *req, unsigned int issue_flags) { struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket); bool fixed = !!sock->file_slot; struct file *file; int ret, fd; if (!fixed) { fd = __get_unused_fd_flags(sock->flags, sock->nofile); if (unlikely(fd < 0)) return fd; } file = __sys_socket_file(sock->domain, sock->type, sock->protocol); if (IS_ERR(file)) { if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); } else if (!fixed) { fd_install(fd, file); ret = fd; } else { ret = io_fixed_fd_install(req, issue_flags, file, sock->file_slot); } io_req_set_res(req, ret, 0); return IOU_OK; } int io_connect_prep_async(struct io_kiocb *req) { struct io_async_connect *io = req->async_data; struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); } int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); conn->in_progress = conn->seen_econnaborted = false; return 0; } int io_connect(struct io_kiocb *req, unsigned int issue_flags) { struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); struct 
io_async_connect __io, *io; unsigned file_flags; int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; if (req_has_async_data(req)) { io = req->async_data; } else { ret = move_addr_to_kernel(connect->addr, connect->addr_len, &__io.address); if (ret) goto out; io = &__io; } file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_connect_file(req->file, &io->address, connect->addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED) && force_nonblock) { if (ret == -EINPROGRESS) { connect->in_progress = true; } else if (ret == -ECONNABORTED) { if (connect->seen_econnaborted) goto out; connect->seen_econnaborted = true; } if (req_has_async_data(req)) return -EAGAIN; if (io_alloc_async_data(req)) { ret = -ENOMEM; goto out; } memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (connect->in_progress) { /* * At least bluetooth will return -EBADFD on a re-connect * attempt, and it's (supposedly) also valid to get -EISCONN * which means the previous result is good. For both of these, * grab the sock_error() and use that for the completion. */ if (ret == -EBADFD || ret == -EISCONN) ret = sock_error(sock_from_file(req->file)->sk); } if (ret == -ERESTARTSYS) ret = -EINTR; out: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; } void io_netmsg_cache_free(struct io_cache_entry *entry) { kfree(container_of(entry, struct io_async_msghdr, cache)); } #endif
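/*
 * Editor's note: an illustrative userspace sketch, not part of the kernel
 * sources. The io_send_zc() path above completes in two steps: the request
 * CQE is posted with IORING_CQE_F_MORE set, and a separate notification CQE
 * (IORING_CQE_F_NOTIF) is posted once the kernel no longer references the
 * user buffer. The sketch below shows one way to consume that pattern with
 * liburing; it assumes a liburing recent enough to provide
 * io_uring_prep_send_zc() (2.3+), a connected socket, and that this is the
 * only request in flight. The function name example_send_zc() is
 * hypothetical.
 */
#include <liburing.h>

static int example_send_zc(struct io_uring *ring, int sockfd,
			   const void *buf, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	unsigned more;
	int sent;

	if (!sqe)
		return -1;

	/* Queue a zero-copy send; zc_flags == 0 means no fixed buffer. */
	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0, 0);
	io_uring_submit(ring);

	/*
	 * First CQE: the send result. IORING_CQE_F_MORE indicates that a
	 * buffer-release notification CQE will follow; until it arrives the
	 * buffer must not be reused.
	 */
	if (io_uring_wait_cqe(ring, &cqe) < 0)
		return -1;
	sent = cqe->res;
	more = cqe->flags & IORING_CQE_F_MORE;
	io_uring_cqe_seen(ring, cqe);

	if (more) {
		/* Second CQE: the IORING_CQE_F_NOTIF notification. */
		if (io_uring_wait_cqe(ring, &cqe) < 0)
			return -1;
		io_uring_cqe_seen(ring, cqe);
	}
	return sent;
}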
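/*
 * Editor's note: a second illustrative userspace sketch (again not kernel
 * code) for the multishot accept flow handled by io_accept() above. With
 * IORING_ACCEPT_MULTISHOT a single SQE keeps producing one CQE per accepted
 * connection, each carrying the new fd in cqe->res and IORING_CQE_F_MORE
 * while the request remains armed. Assumes a liburing that provides
 * io_uring_prep_multishot_accept() (2.2+) and a listening socket;
 * example_accept_loop() is a hypothetical name.
 */
#include <liburing.h>

static int example_accept_loop(struct io_uring *ring, int listen_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	if (!sqe)
		return -1;

	/* One SQE arms the multishot accept; peer address is not collected. */
	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	io_uring_submit(ring);

	for (;;) {
		unsigned flags;
		int res;

		if (io_uring_wait_cqe(ring, &cqe) < 0)
			return -1;
		res = cqe->res;
		flags = cqe->flags;
		io_uring_cqe_seen(ring, cqe);

		if (res < 0)
			return res;	/* error or cancellation ends the multishot */

		/* res is the newly accepted fd; a real server would hand it off here. */

		if (!(flags & IORING_CQE_F_MORE))
			return 0;	/* request disarmed; re-submit to keep accepting */
	}
}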
// SPDX-License-Identifier: GPL-2.0-or-later /* * uvc_ctrl.c -- USB Video Class driver - Controls * * Copyright (C) 2005-2010 * Laurent Pinchart (laurent.pinchart@ideasonboard.com) */ #include <asm/barrier.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/uvc.h> #include <linux/videodev2.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/atomic.h> #include <media/v4l2-ctrls.h> #include "uvcvideo.h" #define UVC_CTRL_DATA_CURRENT 0 #define UVC_CTRL_DATA_BACKUP 1 #define UVC_CTRL_DATA_MIN 2 #define UVC_CTRL_DATA_MAX 3 #define UVC_CTRL_DATA_RES 4 #define UVC_CTRL_DATA_DEF 5 #define UVC_CTRL_DATA_LAST 6 /* ------------------------------------------------------------------------ * Controls */ static const struct uvc_control_info uvc_ctrls[] = { { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_BRIGHTNESS_CONTROL, .index = 0, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_CONTRAST_CONTROL, .index = 1, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_HUE_CONTROL, 
.index = 2, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_SATURATION_CONTROL, .index = 3, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_SHARPNESS_CONTROL, .index = 4, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_GAMMA_CONTROL, .index = 5, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_TEMPERATURE_CONTROL, .index = 6, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_COMPONENT_CONTROL, .index = 7, .size = 4, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_BACKLIGHT_COMPENSATION_CONTROL, .index = 8, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_GAIN_CONTROL, .index = 9, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_POWER_LINE_FREQUENCY_CONTROL, .index = 10, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_HUE_AUTO_CONTROL, .index = 11, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_TEMPERATURE_AUTO_CONTROL, .index = 12, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_COMPONENT_AUTO_CONTROL, .index = 13, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_DIGITAL_MULTIPLIER_CONTROL, .index = 14, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_DIGITAL_MULTIPLIER_LIMIT_CONTROL, .index = 15, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_ANALOG_VIDEO_STANDARD_CONTROL, .index = 16, .size = 1, .flags = UVC_CTRL_FLAG_GET_CUR, }, { .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_ANALOG_LOCK_STATUS_CONTROL, .index = 17, .size = 1, .flags = UVC_CTRL_FLAG_GET_CUR, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_SCANNING_MODE_CONTROL, .index = 0, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_AE_MODE_CONTROL, .index = 1, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_GET_RES | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_AE_PRIORITY_CONTROL, .index = 2, 
.size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_EXPOSURE_TIME_ABSOLUTE_CONTROL, .index = 3, .size = 4, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_EXPOSURE_TIME_RELATIVE_CONTROL, .index = 4, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_FOCUS_ABSOLUTE_CONTROL, .index = 5, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_FOCUS_RELATIVE_CONTROL, .index = 6, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_MIN | UVC_CTRL_FLAG_GET_MAX | UVC_CTRL_FLAG_GET_RES | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_IRIS_ABSOLUTE_CONTROL, .index = 7, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_IRIS_RELATIVE_CONTROL, .index = 8, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_ZOOM_ABSOLUTE_CONTROL, .index = 9, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_ZOOM_RELATIVE_CONTROL, .index = 10, .size = 3, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_MIN | UVC_CTRL_FLAG_GET_MAX | UVC_CTRL_FLAG_GET_RES | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PANTILT_ABSOLUTE_CONTROL, .index = 11, .size = 8, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PANTILT_RELATIVE_CONTROL, .index = 12, .size = 4, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_ROLL_ABSOLUTE_CONTROL, .index = 13, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_RANGE | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_ROLL_RELATIVE_CONTROL, .index = 14, .size = 2, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_MIN | UVC_CTRL_FLAG_GET_MAX | UVC_CTRL_FLAG_GET_RES | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_FOCUS_AUTO_CONTROL, .index = 17, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_RESTORE, }, { .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PRIVACY_CONTROL, .index = 18, .size = 1, .flags = UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_RESTORE | UVC_CTRL_FLAG_AUTO_UPDATE, }, { .entity = UVC_GUID_EXT_GPIO_CONTROLLER, .selector = UVC_CT_PRIVACY_CONTROL, .index = 0, .size = 1, .flags = UVC_CTRL_FLAG_GET_CUR | UVC_CTRL_FLAG_AUTO_UPDATE, }, }; static const u32 uvc_control_classes[] = { V4L2_CID_CAMERA_CLASS, V4L2_CID_USER_CLASS, }; static const int exposure_auto_mapping[] = { 2, 1, 4, 8 }; /* * This function translates the V4L2 menu index @idx, as exposed to userspace as * the V4L2 control value, to the corresponding UVC control value used by the * 
device. The custom menu_mapping in the control @mapping is used when * available, otherwise the function assumes that the V4L2 and UVC values are * identical. * * For controls of type UVC_CTRL_DATA_TYPE_BITMASK, the UVC control value is * expressed as a bitmask and is thus guaranteed to have a single bit set. * * The function returns -EINVAL if the V4L2 menu index @idx isn't valid for the * control, which includes all controls whose type isn't UVC_CTRL_DATA_TYPE_ENUM * or UVC_CTRL_DATA_TYPE_BITMASK. */ static int uvc_mapping_get_menu_value(const struct uvc_control_mapping *mapping, u32 idx) { if (!test_bit(idx, &mapping->menu_mask)) return -EINVAL; if (mapping->menu_mapping) return mapping->menu_mapping[idx]; return idx; } static const char * uvc_mapping_get_menu_name(const struct uvc_control_mapping *mapping, u32 idx) { if (!test_bit(idx, &mapping->menu_mask)) return NULL; if (mapping->menu_names) return mapping->menu_names[idx]; return v4l2_ctrl_get_menu(mapping->id)[idx]; } static s32 uvc_ctrl_get_zoom(struct uvc_control_mapping *mapping, u8 query, const u8 *data) { s8 zoom = (s8)data[0]; switch (query) { case UVC_GET_CUR: return (zoom == 0) ? 0 : (zoom > 0 ? data[2] : -data[2]); case UVC_GET_MIN: case UVC_GET_MAX: case UVC_GET_RES: case UVC_GET_DEF: default: return data[2]; } } static void uvc_ctrl_set_zoom(struct uvc_control_mapping *mapping, s32 value, u8 *data) { data[0] = value == 0 ? 0 : (value > 0) ? 1 : 0xff; data[2] = min((int)abs(value), 0xff); } static s32 uvc_ctrl_get_rel_speed(struct uvc_control_mapping *mapping, u8 query, const u8 *data) { unsigned int first = mapping->offset / 8; s8 rel = (s8)data[first]; switch (query) { case UVC_GET_CUR: return (rel == 0) ? 0 : (rel > 0 ? data[first+1] : -data[first+1]); case UVC_GET_MIN: return -data[first+1]; case UVC_GET_MAX: case UVC_GET_RES: case UVC_GET_DEF: default: return data[first+1]; } } static void uvc_ctrl_set_rel_speed(struct uvc_control_mapping *mapping, s32 value, u8 *data) { unsigned int first = mapping->offset / 8; data[first] = value == 0 ? 0 : (value > 0) ? 
1 : 0xff; data[first+1] = min_t(int, abs(value), 0xff); } static const struct uvc_control_mapping uvc_ctrl_mappings[] = { { .id = V4L2_CID_BRIGHTNESS, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_BRIGHTNESS_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, }, { .id = V4L2_CID_CONTRAST, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_CONTRAST_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_HUE, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_HUE_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, .master_id = V4L2_CID_HUE_AUTO, .master_manual = 0, }, { .id = V4L2_CID_SATURATION, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_SATURATION_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_SHARPNESS, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_SHARPNESS_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_GAMMA, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_GAMMA_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_BACKLIGHT_COMPENSATION, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_BACKLIGHT_COMPENSATION_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_GAIN, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_GAIN_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_HUE_AUTO, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_HUE_AUTO_CONTROL, .size = 1, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_BOOLEAN, .data_type = UVC_CTRL_DATA_TYPE_BOOLEAN, .slave_ids = { V4L2_CID_HUE, }, }, { .id = V4L2_CID_EXPOSURE_AUTO, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_AE_MODE_CONTROL, .size = 4, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_MENU, .data_type = UVC_CTRL_DATA_TYPE_BITMASK, .menu_mapping = exposure_auto_mapping, .menu_mask = GENMASK(V4L2_EXPOSURE_APERTURE_PRIORITY, V4L2_EXPOSURE_AUTO), .slave_ids = { V4L2_CID_EXPOSURE_ABSOLUTE, }, }, { .id = V4L2_CID_EXPOSURE_AUTO_PRIORITY, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_AE_PRIORITY_CONTROL, .size = 1, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_BOOLEAN, .data_type = UVC_CTRL_DATA_TYPE_BOOLEAN, }, { .id = V4L2_CID_EXPOSURE_ABSOLUTE, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_EXPOSURE_TIME_ABSOLUTE_CONTROL, .size = 32, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, .master_id = V4L2_CID_EXPOSURE_AUTO, .master_manual = V4L2_EXPOSURE_MANUAL, }, { .id = V4L2_CID_AUTO_WHITE_BALANCE, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_TEMPERATURE_AUTO_CONTROL, .size = 1, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_BOOLEAN, .data_type = UVC_CTRL_DATA_TYPE_BOOLEAN, .slave_ids = { V4L2_CID_WHITE_BALANCE_TEMPERATURE, }, }, { .id = V4L2_CID_WHITE_BALANCE_TEMPERATURE, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_TEMPERATURE_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, .master_id = V4L2_CID_AUTO_WHITE_BALANCE, .master_manual = 0, }, { .id = 
V4L2_CID_AUTO_WHITE_BALANCE, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_COMPONENT_AUTO_CONTROL, .size = 1, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_BOOLEAN, .data_type = UVC_CTRL_DATA_TYPE_BOOLEAN, .slave_ids = { V4L2_CID_BLUE_BALANCE, V4L2_CID_RED_BALANCE }, }, { .id = V4L2_CID_BLUE_BALANCE, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_COMPONENT_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, .master_id = V4L2_CID_AUTO_WHITE_BALANCE, .master_manual = 0, }, { .id = V4L2_CID_RED_BALANCE, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_WHITE_BALANCE_COMPONENT_CONTROL, .size = 16, .offset = 16, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, .master_id = V4L2_CID_AUTO_WHITE_BALANCE, .master_manual = 0, }, { .id = V4L2_CID_FOCUS_ABSOLUTE, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_FOCUS_ABSOLUTE_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, .master_id = V4L2_CID_FOCUS_AUTO, .master_manual = 0, }, { .id = V4L2_CID_FOCUS_AUTO, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_FOCUS_AUTO_CONTROL, .size = 1, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_BOOLEAN, .data_type = UVC_CTRL_DATA_TYPE_BOOLEAN, .slave_ids = { V4L2_CID_FOCUS_ABSOLUTE, }, }, { .id = V4L2_CID_IRIS_ABSOLUTE, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_IRIS_ABSOLUTE_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_IRIS_RELATIVE, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_IRIS_RELATIVE_CONTROL, .size = 8, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, }, { .id = V4L2_CID_ZOOM_ABSOLUTE, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_ZOOM_ABSOLUTE_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_UNSIGNED, }, { .id = V4L2_CID_ZOOM_CONTINUOUS, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_ZOOM_RELATIVE_CONTROL, .size = 0, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, .get = uvc_ctrl_get_zoom, .set = uvc_ctrl_set_zoom, }, { .id = V4L2_CID_PAN_ABSOLUTE, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PANTILT_ABSOLUTE_CONTROL, .size = 32, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, }, { .id = V4L2_CID_TILT_ABSOLUTE, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PANTILT_ABSOLUTE_CONTROL, .size = 32, .offset = 32, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, }, { .id = V4L2_CID_PAN_SPEED, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PANTILT_RELATIVE_CONTROL, .size = 16, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, .get = uvc_ctrl_get_rel_speed, .set = uvc_ctrl_set_rel_speed, }, { .id = V4L2_CID_TILT_SPEED, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PANTILT_RELATIVE_CONTROL, .size = 16, .offset = 16, .v4l2_type = V4L2_CTRL_TYPE_INTEGER, .data_type = UVC_CTRL_DATA_TYPE_SIGNED, .get = uvc_ctrl_get_rel_speed, .set = uvc_ctrl_set_rel_speed, }, { .id = V4L2_CID_PRIVACY, .entity = UVC_GUID_UVC_CAMERA, .selector = UVC_CT_PRIVACY_CONTROL, .size = 1, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_BOOLEAN, .data_type = UVC_CTRL_DATA_TYPE_BOOLEAN, }, { .id = V4L2_CID_PRIVACY, .entity = UVC_GUID_EXT_GPIO_CONTROLLER, .selector = UVC_CT_PRIVACY_CONTROL, .size = 1, 
.offset = 0, .v4l2_type = V4L2_CTRL_TYPE_BOOLEAN, .data_type = UVC_CTRL_DATA_TYPE_BOOLEAN, }, }; const struct uvc_control_mapping uvc_ctrl_power_line_mapping_limited = { .id = V4L2_CID_POWER_LINE_FREQUENCY, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_POWER_LINE_FREQUENCY_CONTROL, .size = 2, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_MENU, .data_type = UVC_CTRL_DATA_TYPE_ENUM, .menu_mask = GENMASK(V4L2_CID_POWER_LINE_FREQUENCY_60HZ, V4L2_CID_POWER_LINE_FREQUENCY_50HZ), }; const struct uvc_control_mapping uvc_ctrl_power_line_mapping_uvc11 = { .id = V4L2_CID_POWER_LINE_FREQUENCY, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_POWER_LINE_FREQUENCY_CONTROL, .size = 2, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_MENU, .data_type = UVC_CTRL_DATA_TYPE_ENUM, .menu_mask = GENMASK(V4L2_CID_POWER_LINE_FREQUENCY_60HZ, V4L2_CID_POWER_LINE_FREQUENCY_DISABLED), }; static const struct uvc_control_mapping *uvc_ctrl_mappings_uvc11[] = { &uvc_ctrl_power_line_mapping_uvc11, NULL, /* Sentinel */ }; static const struct uvc_control_mapping uvc_ctrl_power_line_mapping_uvc15 = { .id = V4L2_CID_POWER_LINE_FREQUENCY, .entity = UVC_GUID_UVC_PROCESSING, .selector = UVC_PU_POWER_LINE_FREQUENCY_CONTROL, .size = 2, .offset = 0, .v4l2_type = V4L2_CTRL_TYPE_MENU, .data_type = UVC_CTRL_DATA_TYPE_ENUM, .menu_mask = GENMASK(V4L2_CID_POWER_LINE_FREQUENCY_AUTO, V4L2_CID_POWER_LINE_FREQUENCY_DISABLED), }; static const struct uvc_control_mapping *uvc_ctrl_mappings_uvc15[] = { &uvc_ctrl_power_line_mapping_uvc15, NULL, /* Sentinel */ }; /* ------------------------------------------------------------------------ * Utility functions */ static inline u8 *uvc_ctrl_data(struct uvc_control *ctrl, int id) { return ctrl->uvc_data + id * ctrl->info.size; } static inline int uvc_test_bit(const u8 *data, int bit) { return (data[bit >> 3] >> (bit & 7)) & 1; } static inline void uvc_clear_bit(u8 *data, int bit) { data[bit >> 3] &= ~(1 << (bit & 7)); } /* * Extract the bit string specified by mapping->offset and mapping->size * from the little-endian data stored at 'data' and return the result as * a signed 32bit integer. Sign extension will be performed if the mapping * references a signed data type. */ static s32 uvc_get_le_value(struct uvc_control_mapping *mapping, u8 query, const u8 *data) { int bits = mapping->size; int offset = mapping->offset; s32 value = 0; u8 mask; data += offset / 8; offset &= 7; mask = ((1LL << bits) - 1) << offset; while (1) { u8 byte = *data & mask; value |= offset > 0 ? (byte >> offset) : (byte << (-offset)); bits -= 8 - (offset > 0 ? offset : 0); if (bits <= 0) break; offset -= 8; mask = (1 << bits) - 1; data++; } /* Sign-extend the value if needed. */ if (mapping->data_type == UVC_CTRL_DATA_TYPE_SIGNED) value |= -(value & (1 << (mapping->size - 1))); return value; } /* * Set the bit string specified by mapping->offset and mapping->size * in the little-endian data stored at 'data' to the value 'value'. */ static void uvc_set_le_value(struct uvc_control_mapping *mapping, s32 value, u8 *data) { int bits = mapping->size; int offset = mapping->offset; u8 mask; /* * According to the v4l2 spec, writing any value to a button control * should result in the action belonging to the button control being * triggered. UVC devices however want to see a 1 written -> override * value. */ if (mapping->v4l2_type == V4L2_CTRL_TYPE_BUTTON) value = -1; data += offset / 8; offset &= 7; for (; bits > 0; data++) { mask = ((1LL << bits) - 1) << offset; *data = (*data & ~mask) | ((value << offset) & mask); value >>= offset ? 
offset : 8; bits -= 8 - offset; offset = 0; } } /* ------------------------------------------------------------------------ * Terminal and unit management */ static int uvc_entity_match_guid(const struct uvc_entity *entity, const u8 guid[16]) { return memcmp(entity->guid, guid, sizeof(entity->guid)) == 0; } /* ------------------------------------------------------------------------ * UVC Controls */ static void __uvc_find_control(struct uvc_entity *entity, u32 v4l2_id, struct uvc_control_mapping **mapping, struct uvc_control **control, int next) { struct uvc_control *ctrl; struct uvc_control_mapping *map; unsigned int i; if (entity == NULL) return; for (i = 0; i < entity->ncontrols; ++i) { ctrl = &entity->controls[i]; if (!ctrl->initialized) continue; list_for_each_entry(map, &ctrl->info.mappings, list) { if ((map->id == v4l2_id) && !next) { *control = ctrl; *mapping = map; return; } if ((*mapping == NULL || (*mapping)->id > map->id) && (map->id > v4l2_id) && next) { *control = ctrl; *mapping = map; } } } } static struct uvc_control *uvc_find_control(struct uvc_video_chain *chain, u32 v4l2_id, struct uvc_control_mapping **mapping) { struct uvc_control *ctrl = NULL; struct uvc_entity *entity; int next = v4l2_id & V4L2_CTRL_FLAG_NEXT_CTRL; *mapping = NULL; /* Mask the query flags. */ v4l2_id &= V4L2_CTRL_ID_MASK; /* Find the control. */ list_for_each_entry(entity, &chain->entities, chain) { __uvc_find_control(entity, v4l2_id, mapping, &ctrl, next); if (ctrl && !next) return ctrl; } if (ctrl == NULL && !next) uvc_dbg(chain->dev, CONTROL, "Control 0x%08x not found\n", v4l2_id); return ctrl; } static int uvc_ctrl_populate_cache(struct uvc_video_chain *chain, struct uvc_control *ctrl) { int ret; if (ctrl->info.flags & UVC_CTRL_FLAG_GET_DEF) { ret = uvc_query_ctrl(chain->dev, UVC_GET_DEF, ctrl->entity->id, chain->dev->intfnum, ctrl->info.selector, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_DEF), ctrl->info.size); if (ret < 0) return ret; } if (ctrl->info.flags & UVC_CTRL_FLAG_GET_MIN) { ret = uvc_query_ctrl(chain->dev, UVC_GET_MIN, ctrl->entity->id, chain->dev->intfnum, ctrl->info.selector, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_MIN), ctrl->info.size); if (ret < 0) return ret; } if (ctrl->info.flags & UVC_CTRL_FLAG_GET_MAX) { ret = uvc_query_ctrl(chain->dev, UVC_GET_MAX, ctrl->entity->id, chain->dev->intfnum, ctrl->info.selector, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_MAX), ctrl->info.size); if (ret < 0) return ret; } if (ctrl->info.flags & UVC_CTRL_FLAG_GET_RES) { ret = uvc_query_ctrl(chain->dev, UVC_GET_RES, ctrl->entity->id, chain->dev->intfnum, ctrl->info.selector, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_RES), ctrl->info.size); if (ret < 0) { if (UVC_ENTITY_TYPE(ctrl->entity) != UVC_VC_EXTENSION_UNIT) return ret; /* * GET_RES is mandatory for XU controls, but some * cameras still choke on it. Ignore errors and set the * resolution value to zero. */ uvc_warn_once(chain->dev, UVC_WARN_XU_GET_RES, "UVC non compliance - GET_RES failed on " "an XU control. 
Enabling workaround.\n"); memset(uvc_ctrl_data(ctrl, UVC_CTRL_DATA_RES), 0, ctrl->info.size); } } ctrl->cached = 1; return 0; } static s32 __uvc_ctrl_get_value(struct uvc_control_mapping *mapping, const u8 *data) { s32 value = mapping->get(mapping, UVC_GET_CUR, data); if (mapping->v4l2_type == V4L2_CTRL_TYPE_MENU) { unsigned int i; for (i = 0; BIT(i) <= mapping->menu_mask; ++i) { u32 menu_value; if (!test_bit(i, &mapping->menu_mask)) continue; menu_value = uvc_mapping_get_menu_value(mapping, i); if (menu_value == value) { value = i; break; } } } return value; } static int __uvc_ctrl_load_cur(struct uvc_video_chain *chain, struct uvc_control *ctrl) { u8 *data; int ret; if (ctrl->loaded) return 0; data = uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT); if ((ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR) == 0) { memset(data, 0, ctrl->info.size); ctrl->loaded = 1; return 0; } if (ctrl->entity->get_cur) ret = ctrl->entity->get_cur(chain->dev, ctrl->entity, ctrl->info.selector, data, ctrl->info.size); else ret = uvc_query_ctrl(chain->dev, UVC_GET_CUR, ctrl->entity->id, chain->dev->intfnum, ctrl->info.selector, data, ctrl->info.size); if (ret < 0) return ret; ctrl->loaded = 1; return ret; } static int __uvc_ctrl_get(struct uvc_video_chain *chain, struct uvc_control *ctrl, struct uvc_control_mapping *mapping, s32 *value) { int ret; if ((ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR) == 0) return -EACCES; ret = __uvc_ctrl_load_cur(chain, ctrl); if (ret < 0) return ret; *value = __uvc_ctrl_get_value(mapping, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT)); return 0; } static int __uvc_query_v4l2_class(struct uvc_video_chain *chain, u32 req_id, u32 found_id) { bool find_next = req_id & V4L2_CTRL_FLAG_NEXT_CTRL; unsigned int i; req_id &= V4L2_CTRL_ID_MASK; for (i = 0; i < ARRAY_SIZE(uvc_control_classes); i++) { if (!(chain->ctrl_class_bitmap & BIT(i))) continue; if (!find_next) { if (uvc_control_classes[i] == req_id) return i; continue; } if (uvc_control_classes[i] > req_id && uvc_control_classes[i] < found_id) return i; } return -ENODEV; } static int uvc_query_v4l2_class(struct uvc_video_chain *chain, u32 req_id, u32 found_id, struct v4l2_queryctrl *v4l2_ctrl) { int idx; idx = __uvc_query_v4l2_class(chain, req_id, found_id); if (idx < 0) return -ENODEV; memset(v4l2_ctrl, 0, sizeof(*v4l2_ctrl)); v4l2_ctrl->id = uvc_control_classes[idx]; strscpy(v4l2_ctrl->name, v4l2_ctrl_get_name(v4l2_ctrl->id), sizeof(v4l2_ctrl->name)); v4l2_ctrl->type = V4L2_CTRL_TYPE_CTRL_CLASS; v4l2_ctrl->flags = V4L2_CTRL_FLAG_WRITE_ONLY | V4L2_CTRL_FLAG_READ_ONLY; return 0; } /* * Check if control @v4l2_id can be accessed by the given control @ioctl * (VIDIOC_G_EXT_CTRLS, VIDIOC_TRY_EXT_CTRLS or VIDIOC_S_EXT_CTRLS). * * For set operations on slave controls, check if the master's value is set to * manual, either in the others controls set in the same ioctl call, or from * the master's current value. This catches VIDIOC_S_EXT_CTRLS calls that set * both the master and slave control, such as for instance setting * auto_exposure=1, exposure_time_absolute=251. 
*/ int uvc_ctrl_is_accessible(struct uvc_video_chain *chain, u32 v4l2_id, const struct v4l2_ext_controls *ctrls, unsigned long ioctl) { struct uvc_control_mapping *master_map = NULL; struct uvc_control *master_ctrl = NULL; struct uvc_control_mapping *mapping; struct uvc_control *ctrl; bool read = ioctl == VIDIOC_G_EXT_CTRLS; s32 val; int ret; int i; if (__uvc_query_v4l2_class(chain, v4l2_id, 0) >= 0) return -EACCES; ctrl = uvc_find_control(chain, v4l2_id, &mapping); if (!ctrl) return -EINVAL; if (!(ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR) && read) return -EACCES; if (!(ctrl->info.flags & UVC_CTRL_FLAG_SET_CUR) && !read) return -EACCES; if (ioctl != VIDIOC_S_EXT_CTRLS || !mapping->master_id) return 0; /* * Iterate backwards in cases where the master control is accessed * multiple times in the same ioctl. We want the last value. */ for (i = ctrls->count - 1; i >= 0; i--) { if (ctrls->controls[i].id == mapping->master_id) return ctrls->controls[i].value == mapping->master_manual ? 0 : -EACCES; } __uvc_find_control(ctrl->entity, mapping->master_id, &master_map, &master_ctrl, 0); if (!master_ctrl || !(master_ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR)) return 0; ret = __uvc_ctrl_get(chain, master_ctrl, master_map, &val); if (ret >= 0 && val != mapping->master_manual) return -EACCES; return 0; } static const char *uvc_map_get_name(const struct uvc_control_mapping *map) { const char *name; if (map->name) return map->name; name = v4l2_ctrl_get_name(map->id); if (name) return name; return "Unknown Control"; } static u32 uvc_get_ctrl_bitmap(struct uvc_control *ctrl, struct uvc_control_mapping *mapping) { /* * Some controls, like CT_AE_MODE_CONTROL, use GET_RES to represent * the number of bits supported. Those controls do not list GET_MAX * as supported. */ if (ctrl->info.flags & UVC_CTRL_FLAG_GET_RES) return mapping->get(mapping, UVC_GET_RES, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_RES)); if (ctrl->info.flags & UVC_CTRL_FLAG_GET_MAX) return mapping->get(mapping, UVC_GET_MAX, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_MAX)); return ~0; } static int __uvc_query_v4l2_ctrl(struct uvc_video_chain *chain, struct uvc_control *ctrl, struct uvc_control_mapping *mapping, struct v4l2_queryctrl *v4l2_ctrl) { struct uvc_control_mapping *master_map = NULL; struct uvc_control *master_ctrl = NULL; unsigned int i; memset(v4l2_ctrl, 0, sizeof(*v4l2_ctrl)); v4l2_ctrl->id = mapping->id; v4l2_ctrl->type = mapping->v4l2_type; strscpy(v4l2_ctrl->name, uvc_map_get_name(mapping), sizeof(v4l2_ctrl->name)); v4l2_ctrl->flags = 0; if (!(ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR)) v4l2_ctrl->flags |= V4L2_CTRL_FLAG_WRITE_ONLY; if (!(ctrl->info.flags & UVC_CTRL_FLAG_SET_CUR)) v4l2_ctrl->flags |= V4L2_CTRL_FLAG_READ_ONLY; if (mapping->master_id) __uvc_find_control(ctrl->entity, mapping->master_id, &master_map, &master_ctrl, 0); if (master_ctrl && (master_ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR)) { s32 val; int ret = __uvc_ctrl_get(chain, master_ctrl, master_map, &val); if (ret < 0) return ret; if (val != mapping->master_manual) v4l2_ctrl->flags |= V4L2_CTRL_FLAG_INACTIVE; } if (!ctrl->cached) { int ret = uvc_ctrl_populate_cache(chain, ctrl); if (ret < 0) return ret; } if (ctrl->info.flags & UVC_CTRL_FLAG_GET_DEF) { v4l2_ctrl->default_value = mapping->get(mapping, UVC_GET_DEF, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_DEF)); } switch (mapping->v4l2_type) { case V4L2_CTRL_TYPE_MENU: v4l2_ctrl->minimum = ffs(mapping->menu_mask) - 1; v4l2_ctrl->maximum = fls(mapping->menu_mask) - 1; v4l2_ctrl->step = 1; for (i = 0; BIT(i) <= mapping->menu_mask; ++i) { u32 
menu_value; if (!test_bit(i, &mapping->menu_mask)) continue; menu_value = uvc_mapping_get_menu_value(mapping, i); if (menu_value == v4l2_ctrl->default_value) { v4l2_ctrl->default_value = i; break; } } return 0; case V4L2_CTRL_TYPE_BOOLEAN: v4l2_ctrl->minimum = 0; v4l2_ctrl->maximum = 1; v4l2_ctrl->step = 1; return 0; case V4L2_CTRL_TYPE_BUTTON: v4l2_ctrl->minimum = 0; v4l2_ctrl->maximum = 0; v4l2_ctrl->step = 0; return 0; case V4L2_CTRL_TYPE_BITMASK: v4l2_ctrl->minimum = 0; v4l2_ctrl->maximum = uvc_get_ctrl_bitmap(ctrl, mapping); v4l2_ctrl->step = 0; return 0; default: break; } if (ctrl->info.flags & UVC_CTRL_FLAG_GET_MIN) v4l2_ctrl->minimum = mapping->get(mapping, UVC_GET_MIN, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_MIN)); if (ctrl->info.flags & UVC_CTRL_FLAG_GET_MAX) v4l2_ctrl->maximum = mapping->get(mapping, UVC_GET_MAX, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_MAX)); if (ctrl->info.flags & UVC_CTRL_FLAG_GET_RES) v4l2_ctrl->step = mapping->get(mapping, UVC_GET_RES, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_RES)); return 0; } int uvc_query_v4l2_ctrl(struct uvc_video_chain *chain, struct v4l2_queryctrl *v4l2_ctrl) { struct uvc_control *ctrl; struct uvc_control_mapping *mapping; int ret; ret = mutex_lock_interruptible(&chain->ctrl_mutex); if (ret < 0) return -ERESTARTSYS; /* Check if the ctrl is a know class */ if (!(v4l2_ctrl->id & V4L2_CTRL_FLAG_NEXT_CTRL)) { ret = uvc_query_v4l2_class(chain, v4l2_ctrl->id, 0, v4l2_ctrl); if (!ret) goto done; } ctrl = uvc_find_control(chain, v4l2_ctrl->id, &mapping); if (ctrl == NULL) { ret = -EINVAL; goto done; } /* * If we're enumerating control with V4L2_CTRL_FLAG_NEXT_CTRL, check if * a class should be inserted between the previous control and the one * we have just found. */ if (v4l2_ctrl->id & V4L2_CTRL_FLAG_NEXT_CTRL) { ret = uvc_query_v4l2_class(chain, v4l2_ctrl->id, mapping->id, v4l2_ctrl); if (!ret) goto done; } ret = __uvc_query_v4l2_ctrl(chain, ctrl, mapping, v4l2_ctrl); done: mutex_unlock(&chain->ctrl_mutex); return ret; } /* * Mapping V4L2 controls to UVC controls can be straightforward if done well. * Most of the UVC controls exist in V4L2, and can be mapped directly. Some * must be grouped (for instance the Red Balance, Blue Balance and Do White * Balance V4L2 controls use the White Balance Component UVC control) or * otherwise translated. The approach we take here is to use a translation * table for the controls that can be mapped directly, and handle the others * manually. 
*/ int uvc_query_v4l2_menu(struct uvc_video_chain *chain, struct v4l2_querymenu *query_menu) { struct uvc_control_mapping *mapping; struct uvc_control *ctrl; u32 index = query_menu->index; u32 id = query_menu->id; const char *name; int ret; memset(query_menu, 0, sizeof(*query_menu)); query_menu->id = id; query_menu->index = index; if (index >= BITS_PER_TYPE(mapping->menu_mask)) return -EINVAL; ret = mutex_lock_interruptible(&chain->ctrl_mutex); if (ret < 0) return -ERESTARTSYS; ctrl = uvc_find_control(chain, query_menu->id, &mapping); if (ctrl == NULL || mapping->v4l2_type != V4L2_CTRL_TYPE_MENU) { ret = -EINVAL; goto done; } if (!test_bit(query_menu->index, &mapping->menu_mask)) { ret = -EINVAL; goto done; } if (mapping->data_type == UVC_CTRL_DATA_TYPE_BITMASK) { int mask; if (!ctrl->cached) { ret = uvc_ctrl_populate_cache(chain, ctrl); if (ret < 0) goto done; } mask = uvc_mapping_get_menu_value(mapping, query_menu->index); if (mask < 0) { ret = mask; goto done; } if (!(uvc_get_ctrl_bitmap(ctrl, mapping) & mask)) { ret = -EINVAL; goto done; } } name = uvc_mapping_get_menu_name(mapping, query_menu->index); if (!name) { ret = -EINVAL; goto done; } strscpy(query_menu->name, name, sizeof(query_menu->name)); done: mutex_unlock(&chain->ctrl_mutex); return ret; } /* -------------------------------------------------------------------------- * Ctrl event handling */ static void uvc_ctrl_fill_event(struct uvc_video_chain *chain, struct v4l2_event *ev, struct uvc_control *ctrl, struct uvc_control_mapping *mapping, s32 value, u32 changes) { struct v4l2_queryctrl v4l2_ctrl; __uvc_query_v4l2_ctrl(chain, ctrl, mapping, &v4l2_ctrl); memset(ev, 0, sizeof(*ev)); ev->type = V4L2_EVENT_CTRL; ev->id = v4l2_ctrl.id; ev->u.ctrl.value = value; ev->u.ctrl.changes = changes; ev->u.ctrl.type = v4l2_ctrl.type; ev->u.ctrl.flags = v4l2_ctrl.flags; ev->u.ctrl.minimum = v4l2_ctrl.minimum; ev->u.ctrl.maximum = v4l2_ctrl.maximum; ev->u.ctrl.step = v4l2_ctrl.step; ev->u.ctrl.default_value = v4l2_ctrl.default_value; } /* * Send control change events to all subscribers for the @ctrl control. By * default the subscriber that generated the event, as identified by @handle, * is not notified unless it has set the V4L2_EVENT_SUB_FL_ALLOW_FEEDBACK flag. * @handle can be NULL for asynchronous events related to auto-update controls, * in which case all subscribers are notified. */ static void uvc_ctrl_send_event(struct uvc_video_chain *chain, struct uvc_fh *handle, struct uvc_control *ctrl, struct uvc_control_mapping *mapping, s32 value, u32 changes) { struct v4l2_fh *originator = handle ? &handle->vfh : NULL; struct v4l2_subscribed_event *sev; struct v4l2_event ev; if (list_empty(&mapping->ev_subs)) return; uvc_ctrl_fill_event(chain, &ev, ctrl, mapping, value, changes); list_for_each_entry(sev, &mapping->ev_subs, node) { if (sev->fh != originator || (sev->flags & V4L2_EVENT_SUB_FL_ALLOW_FEEDBACK) || (changes & V4L2_EVENT_CTRL_CH_FLAGS)) v4l2_event_queue_fh(sev->fh, &ev); } } /* * Send control change events for the slave of the @master control identified * by the V4L2 ID @slave_id. The @handle identifies the event subscriber that * generated the event and may be NULL for auto-update events. 
*/ static void uvc_ctrl_send_slave_event(struct uvc_video_chain *chain, struct uvc_fh *handle, struct uvc_control *master, u32 slave_id) { struct uvc_control_mapping *mapping = NULL; struct uvc_control *ctrl = NULL; u32 changes = V4L2_EVENT_CTRL_CH_FLAGS; s32 val = 0; __uvc_find_control(master->entity, slave_id, &mapping, &ctrl, 0); if (ctrl == NULL) return; if (__uvc_ctrl_get(chain, ctrl, mapping, &val) == 0) changes |= V4L2_EVENT_CTRL_CH_VALUE; uvc_ctrl_send_event(chain, handle, ctrl, mapping, val, changes); } void uvc_ctrl_status_event(struct uvc_video_chain *chain, struct uvc_control *ctrl, const u8 *data) { struct uvc_control_mapping *mapping; struct uvc_fh *handle; unsigned int i; mutex_lock(&chain->ctrl_mutex); handle = ctrl->handle; ctrl->handle = NULL; list_for_each_entry(mapping, &ctrl->info.mappings, list) { s32 value = __uvc_ctrl_get_value(mapping, data); /* * handle may be NULL here if the device sends auto-update * events without a prior related control set from userspace. */ for (i = 0; i < ARRAY_SIZE(mapping->slave_ids); ++i) { if (!mapping->slave_ids[i]) break; uvc_ctrl_send_slave_event(chain, handle, ctrl, mapping->slave_ids[i]); } uvc_ctrl_send_event(chain, handle, ctrl, mapping, value, V4L2_EVENT_CTRL_CH_VALUE); } mutex_unlock(&chain->ctrl_mutex); } static void uvc_ctrl_status_event_work(struct work_struct *work) { struct uvc_device *dev = container_of(work, struct uvc_device, async_ctrl.work); struct uvc_ctrl_work *w = &dev->async_ctrl; int ret; uvc_ctrl_status_event(w->chain, w->ctrl, w->data); /* The barrier is needed to synchronize with uvc_status_stop(). */ if (smp_load_acquire(&dev->flush_status)) return; /* Resubmit the URB. */ w->urb->interval = dev->int_ep->desc.bInterval; ret = usb_submit_urb(w->urb, GFP_KERNEL); if (ret < 0) dev_err(&dev->udev->dev, "Failed to resubmit status URB (%d).\n", ret); } bool uvc_ctrl_status_event_async(struct urb *urb, struct uvc_video_chain *chain, struct uvc_control *ctrl, const u8 *data) { struct uvc_device *dev = chain->dev; struct uvc_ctrl_work *w = &dev->async_ctrl; if (list_empty(&ctrl->info.mappings)) { ctrl->handle = NULL; return false; } w->data = data; w->urb = urb; w->chain = chain; w->ctrl = ctrl; schedule_work(&w->work); return true; } static bool uvc_ctrl_xctrls_has_control(const struct v4l2_ext_control *xctrls, unsigned int xctrls_count, u32 id) { unsigned int i; for (i = 0; i < xctrls_count; ++i) { if (xctrls[i].id == id) return true; } return false; } static void uvc_ctrl_send_events(struct uvc_fh *handle, const struct v4l2_ext_control *xctrls, unsigned int xctrls_count) { struct uvc_control_mapping *mapping; struct uvc_control *ctrl; u32 changes = V4L2_EVENT_CTRL_CH_VALUE; unsigned int i; unsigned int j; for (i = 0; i < xctrls_count; ++i) { ctrl = uvc_find_control(handle->chain, xctrls[i].id, &mapping); if (ctrl->info.flags & UVC_CTRL_FLAG_ASYNCHRONOUS) /* Notification will be sent from an Interrupt event. */ continue; for (j = 0; j < ARRAY_SIZE(mapping->slave_ids); ++j) { u32 slave_id = mapping->slave_ids[j]; if (!slave_id) break; /* * We can skip sending an event for the slave if the * slave is being modified in the same transaction. */ if (uvc_ctrl_xctrls_has_control(xctrls, xctrls_count, slave_id)) continue; uvc_ctrl_send_slave_event(handle->chain, handle, ctrl, slave_id); } /* * If the master is being modified in the same transaction * flags may change too. 
*/ if (mapping->master_id && uvc_ctrl_xctrls_has_control(xctrls, xctrls_count, mapping->master_id)) changes |= V4L2_EVENT_CTRL_CH_FLAGS; uvc_ctrl_send_event(handle->chain, handle, ctrl, mapping, xctrls[i].value, changes); } } static int uvc_ctrl_add_event(struct v4l2_subscribed_event *sev, unsigned elems) { struct uvc_fh *handle = container_of(sev->fh, struct uvc_fh, vfh); struct uvc_control_mapping *mapping; struct uvc_control *ctrl; int ret; ret = mutex_lock_interruptible(&handle->chain->ctrl_mutex); if (ret < 0) return -ERESTARTSYS; if (__uvc_query_v4l2_class(handle->chain, sev->id, 0) >= 0) { ret = 0; goto done; } ctrl = uvc_find_control(handle->chain, sev->id, &mapping); if (ctrl == NULL) { ret = -EINVAL; goto done; } list_add_tail(&sev->node, &mapping->ev_subs); if (sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL) { struct v4l2_event ev; u32 changes = V4L2_EVENT_CTRL_CH_FLAGS; s32 val = 0; if (__uvc_ctrl_get(handle->chain, ctrl, mapping, &val) == 0) changes |= V4L2_EVENT_CTRL_CH_VALUE; uvc_ctrl_fill_event(handle->chain, &ev, ctrl, mapping, val, changes); /* * Mark the queue as active, allowing this initial event to be * accepted. */ sev->elems = elems; v4l2_event_queue_fh(sev->fh, &ev); } done: mutex_unlock(&handle->chain->ctrl_mutex); return ret; } static void uvc_ctrl_del_event(struct v4l2_subscribed_event *sev) { struct uvc_fh *handle = container_of(sev->fh, struct uvc_fh, vfh); mutex_lock(&handle->chain->ctrl_mutex); if (__uvc_query_v4l2_class(handle->chain, sev->id, 0) >= 0) goto done; list_del(&sev->node); done: mutex_unlock(&handle->chain->ctrl_mutex); } const struct v4l2_subscribed_event_ops uvc_ctrl_sub_ev_ops = { .add = uvc_ctrl_add_event, .del = uvc_ctrl_del_event, .replace = v4l2_ctrl_replace, .merge = v4l2_ctrl_merge, }; /* -------------------------------------------------------------------------- * Control transactions * * To make extended set operations as atomic as the hardware allows, controls * are handled using begin/commit/rollback operations. * * At the beginning of a set request, uvc_ctrl_begin should be called to * initialize the request. This function acquires the control lock. * * When setting a control, the new value is stored in the control data field * at position UVC_CTRL_DATA_CURRENT. The control is then marked as dirty for * later processing. If the UVC and V4L2 control sizes differ, the current * value is loaded from the hardware before storing the new value in the data * field. * * After processing all controls in the transaction, uvc_ctrl_commit or * uvc_ctrl_rollback must be called to apply the pending changes to the * hardware or revert them. When applying changes, all controls marked as * dirty will be modified in the UVC device, and the dirty flag will be * cleared. When reverting controls, the control data field * UVC_CTRL_DATA_CURRENT is reverted to its previous value * (UVC_CTRL_DATA_BACKUP) for all dirty controls. Both functions release the * control lock. */ int uvc_ctrl_begin(struct uvc_video_chain *chain) { return mutex_lock_interruptible(&chain->ctrl_mutex) ? 
-ERESTARTSYS : 0; } static int uvc_ctrl_commit_entity(struct uvc_device *dev, struct uvc_entity *entity, int rollback, struct uvc_control **err_ctrl) { struct uvc_control *ctrl; unsigned int i; int ret; if (entity == NULL) return 0; for (i = 0; i < entity->ncontrols; ++i) { ctrl = &entity->controls[i]; if (!ctrl->initialized) continue; /* * Reset the loaded flag for auto-update controls that were * marked as loaded in uvc_ctrl_get/uvc_ctrl_set to prevent * uvc_ctrl_get from using the cached value, and for write-only * controls to prevent uvc_ctrl_set from setting bits not * explicitly set by the user. */ if (ctrl->info.flags & UVC_CTRL_FLAG_AUTO_UPDATE || !(ctrl->info.flags & UVC_CTRL_FLAG_GET_CUR)) ctrl->loaded = 0; if (!ctrl->dirty) continue; if (!rollback) ret = uvc_query_ctrl(dev, UVC_SET_CUR, ctrl->entity->id, dev->intfnum, ctrl->info.selector, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT), ctrl->info.size); else ret = 0; if (rollback || ret < 0) memcpy(uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT), uvc_ctrl_data(ctrl, UVC_CTRL_DATA_BACKUP), ctrl->info.size); ctrl->dirty = 0; if (ret < 0) { if (err_ctrl) *err_ctrl = ctrl; return ret; } } return 0; } static int uvc_ctrl_find_ctrl_idx(struct uvc_entity *entity, struct v4l2_ext_controls *ctrls, struct uvc_control *uvc_control) { struct uvc_control_mapping *mapping = NULL; struct uvc_control *ctrl_found = NULL; unsigned int i; if (!entity) return ctrls->count; for (i = 0; i < ctrls->count; i++) { __uvc_find_control(entity, ctrls->controls[i].id, &mapping, &ctrl_found, 0); if (uvc_control == ctrl_found) return i; } return ctrls->count; } int __uvc_ctrl_commit(struct uvc_fh *handle, int rollback, struct v4l2_ext_controls *ctrls) { struct uvc_video_chain *chain = handle->chain; struct uvc_control *err_ctrl; struct uvc_entity *entity; int ret = 0; /* Find the control. */ list_for_each_entry(entity, &chain->entities, chain) { ret = uvc_ctrl_commit_entity(chain->dev, entity, rollback, &err_ctrl); if (ret < 0) goto done; } if (!rollback) uvc_ctrl_send_events(handle, ctrls->controls, ctrls->count); done: if (ret < 0 && ctrls) ctrls->error_idx = uvc_ctrl_find_ctrl_idx(entity, ctrls, err_ctrl); mutex_unlock(&chain->ctrl_mutex); return ret; } int uvc_ctrl_get(struct uvc_video_chain *chain, struct v4l2_ext_control *xctrl) { struct uvc_control *ctrl; struct uvc_control_mapping *mapping; if (__uvc_query_v4l2_class(chain, xctrl->id, 0) >= 0) return -EACCES; ctrl = uvc_find_control(chain, xctrl->id, &mapping); if (ctrl == NULL) return -EINVAL; return __uvc_ctrl_get(chain, ctrl, mapping, &xctrl->value); } int uvc_ctrl_set(struct uvc_fh *handle, struct v4l2_ext_control *xctrl) { struct uvc_video_chain *chain = handle->chain; struct uvc_control *ctrl; struct uvc_control_mapping *mapping; s32 value; u32 step; s32 min; s32 max; int ret; if (__uvc_query_v4l2_class(chain, xctrl->id, 0) >= 0) return -EACCES; ctrl = uvc_find_control(chain, xctrl->id, &mapping); if (ctrl == NULL) return -EINVAL; if (!(ctrl->info.flags & UVC_CTRL_FLAG_SET_CUR)) return -EACCES; /* Clamp out of range values. 
*/ switch (mapping->v4l2_type) { case V4L2_CTRL_TYPE_INTEGER: if (!ctrl->cached) { ret = uvc_ctrl_populate_cache(chain, ctrl); if (ret < 0) return ret; } min = mapping->get(mapping, UVC_GET_MIN, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_MIN)); max = mapping->get(mapping, UVC_GET_MAX, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_MAX)); step = mapping->get(mapping, UVC_GET_RES, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_RES)); if (step == 0) step = 1; xctrl->value = min + DIV_ROUND_CLOSEST((u32)(xctrl->value - min), step) * step; if (mapping->data_type == UVC_CTRL_DATA_TYPE_SIGNED) xctrl->value = clamp(xctrl->value, min, max); else xctrl->value = clamp_t(u32, xctrl->value, min, max); value = xctrl->value; break; case V4L2_CTRL_TYPE_BITMASK: if (!ctrl->cached) { ret = uvc_ctrl_populate_cache(chain, ctrl); if (ret < 0) return ret; } xctrl->value &= uvc_get_ctrl_bitmap(ctrl, mapping); value = xctrl->value; break; case V4L2_CTRL_TYPE_BOOLEAN: xctrl->value = clamp(xctrl->value, 0, 1); value = xctrl->value; break; case V4L2_CTRL_TYPE_MENU: if (xctrl->value < (ffs(mapping->menu_mask) - 1) || xctrl->value > (fls(mapping->menu_mask) - 1)) return -ERANGE; if (!test_bit(xctrl->value, &mapping->menu_mask)) return -EINVAL; value = uvc_mapping_get_menu_value(mapping, xctrl->value); /* * Valid menu indices are reported by the GET_RES request for * UVC controls that support it. */ if (mapping->data_type == UVC_CTRL_DATA_TYPE_BITMASK) { if (!ctrl->cached) { ret = uvc_ctrl_populate_cache(chain, ctrl); if (ret < 0) return ret; } if (!(uvc_get_ctrl_bitmap(ctrl, mapping) & value)) return -EINVAL; } break; default: value = xctrl->value; break; } /* * If the mapping doesn't span the whole UVC control, the current value * needs to be loaded from the device to perform the read-modify-write * operation. */ if ((ctrl->info.size * 8) != mapping->size) { ret = __uvc_ctrl_load_cur(chain, ctrl); if (ret < 0) return ret; } /* Backup the current value in case we need to rollback later. */ if (!ctrl->dirty) { memcpy(uvc_ctrl_data(ctrl, UVC_CTRL_DATA_BACKUP), uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT), ctrl->info.size); } mapping->set(mapping, value, uvc_ctrl_data(ctrl, UVC_CTRL_DATA_CURRENT)); if (ctrl->info.flags & UVC_CTRL_FLAG_ASYNCHRONOUS) ctrl->handle = handle; ctrl->dirty = 1; ctrl->modified = 1; return 0; } /* -------------------------------------------------------------------------- * Dynamic controls */ /* * Retrieve flags for a given control */ static int uvc_ctrl_get_flags(struct uvc_device *dev, const struct uvc_control *ctrl, struct uvc_control_info *info) { u8 *data; int ret; data = kmalloc(1, GFP_KERNEL); if (data == NULL) return -ENOMEM; if (ctrl->entity->get_info) ret = ctrl->entity->get_info(dev, ctrl->entity, ctrl->info.selector, data); else ret = uvc_query_ctrl(dev, UVC_GET_INFO, ctrl->entity->id, dev->intfnum, info->selector, data, 1); if (!ret) info->flags |= (data[0] & UVC_CONTROL_CAP_GET ? UVC_CTRL_FLAG_GET_CUR : 0) | (data[0] & UVC_CONTROL_CAP_SET ? UVC_CTRL_FLAG_SET_CUR : 0) | (data[0] & UVC_CONTROL_CAP_AUTOUPDATE ? UVC_CTRL_FLAG_AUTO_UPDATE : 0) | (data[0] & UVC_CONTROL_CAP_ASYNCHRONOUS ? 
UVC_CTRL_FLAG_ASYNCHRONOUS : 0); kfree(data); return ret; } static void uvc_ctrl_fixup_xu_info(struct uvc_device *dev, const struct uvc_control *ctrl, struct uvc_control_info *info) { struct uvc_ctrl_fixup { struct usb_device_id id; u8 entity; u8 selector; u8 flags; }; static const struct uvc_ctrl_fixup fixups[] = { { { USB_DEVICE(0x046d, 0x08c2) }, 9, 1, UVC_CTRL_FLAG_GET_MIN | UVC_CTRL_FLAG_GET_MAX | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_AUTO_UPDATE }, { { USB_DEVICE(0x046d, 0x08cc) }, 9, 1, UVC_CTRL_FLAG_GET_MIN | UVC_CTRL_FLAG_GET_MAX | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_AUTO_UPDATE }, { { USB_DEVICE(0x046d, 0x0994) }, 9, 1, UVC_CTRL_FLAG_GET_MIN | UVC_CTRL_FLAG_GET_MAX | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_SET_CUR | UVC_CTRL_FLAG_AUTO_UPDATE }, }; unsigned int i; for (i = 0; i < ARRAY_SIZE(fixups); ++i) { if (!usb_match_one_id(dev->intf, &fixups[i].id)) continue; if (fixups[i].entity == ctrl->entity->id && fixups[i].selector == info->selector) { info->flags = fixups[i].flags; return; } } } /* * Query control information (size and flags) for XU controls. */ static int uvc_ctrl_fill_xu_info(struct uvc_device *dev, const struct uvc_control *ctrl, struct uvc_control_info *info) { u8 *data; int ret; data = kmalloc(2, GFP_KERNEL); if (data == NULL) return -ENOMEM; memcpy(info->entity, ctrl->entity->guid, sizeof(info->entity)); info->index = ctrl->index; info->selector = ctrl->index + 1; /* Query and verify the control length (GET_LEN) */ ret = uvc_query_ctrl(dev, UVC_GET_LEN, ctrl->entity->id, dev->intfnum, info->selector, data, 2); if (ret < 0) { uvc_dbg(dev, CONTROL, "GET_LEN failed on control %pUl/%u (%d)\n", info->entity, info->selector, ret); goto done; } info->size = le16_to_cpup((__le16 *)data); info->flags = UVC_CTRL_FLAG_GET_MIN | UVC_CTRL_FLAG_GET_MAX | UVC_CTRL_FLAG_GET_RES | UVC_CTRL_FLAG_GET_DEF; ret = uvc_ctrl_get_flags(dev, ctrl, info); if (ret < 0) { uvc_dbg(dev, CONTROL, "Failed to get flags for control %pUl/%u (%d)\n", info->entity, info->selector, ret); goto done; } uvc_ctrl_fixup_xu_info(dev, ctrl, info); uvc_dbg(dev, CONTROL, "XU control %pUl/%u queried: len %u, flags { get %u set %u auto %u }\n", info->entity, info->selector, info->size, (info->flags & UVC_CTRL_FLAG_GET_CUR) ? 1 : 0, (info->flags & UVC_CTRL_FLAG_SET_CUR) ? 1 : 0, (info->flags & UVC_CTRL_FLAG_AUTO_UPDATE) ? 1 : 0); done: kfree(data); return ret; } static int uvc_ctrl_add_info(struct uvc_device *dev, struct uvc_control *ctrl, const struct uvc_control_info *info); static int uvc_ctrl_init_xu_ctrl(struct uvc_device *dev, struct uvc_control *ctrl) { struct uvc_control_info info; int ret; if (ctrl->initialized) return 0; ret = uvc_ctrl_fill_xu_info(dev, ctrl, &info); if (ret < 0) return ret; ret = uvc_ctrl_add_info(dev, ctrl, &info); if (ret < 0) uvc_dbg(dev, CONTROL, "Failed to initialize control %pUl/%u on device %s entity %u\n", info.entity, info.selector, dev->udev->devpath, ctrl->entity->id); return ret; } int uvc_xu_ctrl_query(struct uvc_video_chain *chain, struct uvc_xu_control_query *xqry) { struct uvc_entity *entity; struct uvc_control *ctrl; unsigned int i; bool found; u32 reqflags; u16 size; u8 *data = NULL; int ret; /* Find the extension unit. 
*/ found = false; list_for_each_entry(entity, &chain->entities, chain) { if (UVC_ENTITY_TYPE(entity) == UVC_VC_EXTENSION_UNIT && entity->id == xqry->unit) { found = true; break; } } if (!found) { uvc_dbg(chain->dev, CONTROL, "Extension unit %u not found\n", xqry->unit); return -ENOENT; } /* Find the control and perform delayed initialization if needed. */ found = false; for (i = 0; i < entity->ncontrols; ++i) { ctrl = &entity->controls[i]; if (ctrl->index == xqry->selector - 1) { found = true; break; } } if (!found) { uvc_dbg(chain->dev, CONTROL, "Control %pUl/%u not found\n", entity->guid, xqry->selector); return -ENOENT; } if (mutex_lock_interruptible(&chain->ctrl_mutex)) return -ERESTARTSYS; ret = uvc_ctrl_init_xu_ctrl(chain->dev, ctrl); if (ret < 0) { ret = -ENOENT; goto done; } /* Validate the required buffer size and flags for the request */ reqflags = 0; size = ctrl->info.size; switch (xqry->query) { case UVC_GET_CUR: reqflags = UVC_CTRL_FLAG_GET_CUR; break; case UVC_GET_MIN: reqflags = UVC_CTRL_FLAG_GET_MIN; break; case UVC_GET_MAX: reqflags = UVC_CTRL_FLAG_GET_MAX; break; case UVC_GET_DEF: reqflags = UVC_CTRL_FLAG_GET_DEF; break; case UVC_GET_RES: reqflags = UVC_CTRL_FLAG_GET_RES; break; case UVC_SET_CUR: reqflags = UVC_CTRL_FLAG_SET_CUR; break; case UVC_GET_LEN: size = 2; break; case UVC_GET_INFO: size = 1; break; default: ret = -EINVAL; goto done; } if (size != xqry->size) { ret = -ENOBUFS; goto done; } if (reqflags && !(ctrl->info.flags & reqflags)) { ret = -EBADRQC; goto done; } data = kmalloc(size, GFP_KERNEL); if (data == NULL) { ret = -ENOMEM; goto done; } if (xqry->query == UVC_SET_CUR && copy_from_user(data, xqry->data, size)) { ret = -EFAULT; goto done; } ret = uvc_query_ctrl(chain->dev, xqry->query, xqry->unit, chain->dev->intfnum, xqry->selector, data, size); if (ret < 0) goto done; if (xqry->query != UVC_SET_CUR && copy_to_user(xqry->data, data, size)) ret = -EFAULT; done: kfree(data); mutex_unlock(&chain->ctrl_mutex); return ret; } /* -------------------------------------------------------------------------- * Suspend/resume */ /* * Restore control values after resume, skipping controls that haven't been * changed. * * TODO * - Don't restore modified controls that are back to their default value. * - Handle restore order (Auto-Exposure Mode should be restored before * Exposure Time). */ int uvc_ctrl_restore_values(struct uvc_device *dev) { struct uvc_control *ctrl; struct uvc_entity *entity; unsigned int i; int ret; /* Walk the entities list and restore controls when possible. */ list_for_each_entry(entity, &dev->entities, list) { for (i = 0; i < entity->ncontrols; ++i) { ctrl = &entity->controls[i]; if (!ctrl->initialized || !ctrl->modified || (ctrl->info.flags & UVC_CTRL_FLAG_RESTORE) == 0) continue; dev_dbg(&dev->udev->dev, "restoring control %pUl/%u/%u\n", ctrl->info.entity, ctrl->info.index, ctrl->info.selector); ctrl->dirty = 1; } ret = uvc_ctrl_commit_entity(dev, entity, 0, NULL); if (ret < 0) return ret; } return 0; } /* -------------------------------------------------------------------------- * Control and mapping handling */ /* * Add control information to a given control. */ static int uvc_ctrl_add_info(struct uvc_device *dev, struct uvc_control *ctrl, const struct uvc_control_info *info) { ctrl->info = *info; INIT_LIST_HEAD(&ctrl->info.mappings); /* Allocate an array to save control values (cur, def, max, etc.) 
*/ ctrl->uvc_data = kzalloc(ctrl->info.size * UVC_CTRL_DATA_LAST + 1, GFP_KERNEL); if (!ctrl->uvc_data) return -ENOMEM; ctrl->initialized = 1; uvc_dbg(dev, CONTROL, "Added control %pUl/%u to device %s entity %u\n", ctrl->info.entity, ctrl->info.selector, dev->udev->devpath, ctrl->entity->id); return 0; } /* * Add a control mapping to a given control. */ static int __uvc_ctrl_add_mapping(struct uvc_video_chain *chain, struct uvc_control *ctrl, const struct uvc_control_mapping *mapping) { struct uvc_control_mapping *map; unsigned int size; unsigned int i; /* * Most mappings come from static kernel data, and need to be duplicated. * Mappings that come from userspace will be unnecessarily duplicated, * this could be optimized. */ map = kmemdup(mapping, sizeof(*mapping), GFP_KERNEL); if (!map) return -ENOMEM; map->name = NULL; map->menu_names = NULL; map->menu_mapping = NULL; /* For UVCIOC_CTRL_MAP custom control */ if (mapping->name) { map->name = kstrdup(mapping->name, GFP_KERNEL); if (!map->name) goto err_nomem; } INIT_LIST_HEAD(&map->ev_subs); if (mapping->menu_mapping && mapping->menu_mask) { size = sizeof(mapping->menu_mapping[0]) * fls(mapping->menu_mask); map->menu_mapping = kmemdup(mapping->menu_mapping, size, GFP_KERNEL); if (!map->menu_mapping) goto err_nomem; } if (mapping->menu_names && mapping->menu_mask) { size = sizeof(mapping->menu_names[0]) * fls(mapping->menu_mask); map->menu_names = kmemdup(mapping->menu_names, size, GFP_KERNEL); if (!map->menu_names) goto err_nomem; } if (map->get == NULL) map->get = uvc_get_le_value; if (map->set == NULL) map->set = uvc_set_le_value; for (i = 0; i < ARRAY_SIZE(uvc_control_classes); i++) { if (V4L2_CTRL_ID2WHICH(uvc_control_classes[i]) == V4L2_CTRL_ID2WHICH(map->id)) { chain->ctrl_class_bitmap |= BIT(i); break; } } list_add_tail(&map->list, &ctrl->info.mappings); uvc_dbg(chain->dev, CONTROL, "Adding mapping '%s' to control %pUl/%u\n", uvc_map_get_name(map), ctrl->info.entity, ctrl->info.selector); return 0; err_nomem: kfree(map->menu_names); kfree(map->menu_mapping); kfree(map->name); kfree(map); return -ENOMEM; } int uvc_ctrl_add_mapping(struct uvc_video_chain *chain, const struct uvc_control_mapping *mapping) { struct uvc_device *dev = chain->dev; struct uvc_control_mapping *map; struct uvc_entity *entity; struct uvc_control *ctrl; int found = 0; int ret; if (mapping->id & ~V4L2_CTRL_ID_MASK) { uvc_dbg(dev, CONTROL, "Can't add mapping '%s', control id 0x%08x is invalid\n", uvc_map_get_name(mapping), mapping->id); return -EINVAL; } /* Search for the matching (GUID/CS) control on the current chain */ list_for_each_entry(entity, &chain->entities, chain) { unsigned int i; if (UVC_ENTITY_TYPE(entity) != UVC_VC_EXTENSION_UNIT || !uvc_entity_match_guid(entity, mapping->entity)) continue; for (i = 0; i < entity->ncontrols; ++i) { ctrl = &entity->controls[i]; if (ctrl->index == mapping->selector - 1) { found = 1; break; } } if (found) break; } if (!found) return -ENOENT; if (mutex_lock_interruptible(&chain->ctrl_mutex)) return -ERESTARTSYS; /* Perform delayed initialization of XU controls */ ret = uvc_ctrl_init_xu_ctrl(dev, ctrl); if (ret < 0) { ret = -ENOENT; goto done; } /* Validate the user-provided bit-size and offset */ if (mapping->size > 32 || mapping->offset + mapping->size > ctrl->info.size * 8) { ret = -EINVAL; goto done; } list_for_each_entry(map, &ctrl->info.mappings, list) { if (mapping->id == map->id) { uvc_dbg(dev, CONTROL, "Can't add mapping '%s', control id 0x%08x already exists\n", uvc_map_get_name(mapping), mapping->id); ret = 
-EEXIST; goto done; } } /* Prevent excess memory consumption */ if (atomic_inc_return(&dev->nmappings) > UVC_MAX_CONTROL_MAPPINGS) { atomic_dec(&dev->nmappings); uvc_dbg(dev, CONTROL, "Can't add mapping '%s', maximum mappings count (%u) exceeded\n", uvc_map_get_name(mapping), UVC_MAX_CONTROL_MAPPINGS); ret = -ENOMEM; goto done; } ret = __uvc_ctrl_add_mapping(chain, ctrl, mapping); if (ret < 0) atomic_dec(&dev->nmappings); done: mutex_unlock(&chain->ctrl_mutex); return ret; } /* * Prune an entity of its bogus controls using a blacklist. Bogus controls * are currently the ones that crash the camera or unconditionally return an * error when queried. */ static void uvc_ctrl_prune_entity(struct uvc_device *dev, struct uvc_entity *entity) { struct uvc_ctrl_blacklist { struct usb_device_id id; u8 index; }; static const struct uvc_ctrl_blacklist processing_blacklist[] = { { { USB_DEVICE(0x13d3, 0x509b) }, 9 }, /* Gain */ { { USB_DEVICE(0x1c4f, 0x3000) }, 6 }, /* WB Temperature */ { { USB_DEVICE(0x5986, 0x0241) }, 2 }, /* Hue */ }; static const struct uvc_ctrl_blacklist camera_blacklist[] = { { { USB_DEVICE(0x06f8, 0x3005) }, 9 }, /* Zoom, Absolute */ }; const struct uvc_ctrl_blacklist *blacklist; unsigned int size; unsigned int count; unsigned int i; u8 *controls; switch (UVC_ENTITY_TYPE(entity)) { case UVC_VC_PROCESSING_UNIT: blacklist = processing_blacklist; count = ARRAY_SIZE(processing_blacklist); controls = entity->processing.bmControls; size = entity->processing.bControlSize; break; case UVC_ITT_CAMERA: blacklist = camera_blacklist; count = ARRAY_SIZE(camera_blacklist); controls = entity->camera.bmControls; size = entity->camera.bControlSize; break; default: return; } for (i = 0; i < count; ++i) { if (!usb_match_one_id(dev->intf, &blacklist[i].id)) continue; if (blacklist[i].index >= 8 * size || !uvc_test_bit(controls, blacklist[i].index)) continue; uvc_dbg(dev, CONTROL, "%u/%u control is black listed, removing it\n", entity->id, blacklist[i].index); uvc_clear_bit(controls, blacklist[i].index); } } /* * Add control information and hardcoded stock control mappings to the given * device. */ static void uvc_ctrl_init_ctrl(struct uvc_video_chain *chain, struct uvc_control *ctrl) { const struct uvc_control_mapping **mappings; unsigned int i; /* * XU controls initialization requires querying the device for control * information. As some buggy UVC devices will crash when queried * repeatedly in a tight loop, delay XU controls initialization until * first use. */ if (UVC_ENTITY_TYPE(ctrl->entity) == UVC_VC_EXTENSION_UNIT) return; for (i = 0; i < ARRAY_SIZE(uvc_ctrls); ++i) { const struct uvc_control_info *info = &uvc_ctrls[i]; if (uvc_entity_match_guid(ctrl->entity, info->entity) && ctrl->index == info->index) { uvc_ctrl_add_info(chain->dev, ctrl, info); /* * Retrieve control flags from the device. Ignore errors * and work with default flag values from the uvc_ctrl * array when the device doesn't properly implement * GET_INFO on standard controls. */ uvc_ctrl_get_flags(chain->dev, ctrl, &ctrl->info); break; } } if (!ctrl->initialized) return; /* * First check if the device provides a custom mapping for this control, * used to override standard mappings for non-conformant devices. Don't * process standard mappings if a custom mapping is found. This * mechanism doesn't support combining standard and custom mappings for * a single control. 
*/ if (chain->dev->info->mappings) { bool custom = false; for (i = 0; chain->dev->info->mappings[i]; ++i) { const struct uvc_control_mapping *mapping = chain->dev->info->mappings[i]; if (uvc_entity_match_guid(ctrl->entity, mapping->entity) && ctrl->info.selector == mapping->selector) { __uvc_ctrl_add_mapping(chain, ctrl, mapping); custom = true; } } if (custom) return; } /* Process common mappings next. */ for (i = 0; i < ARRAY_SIZE(uvc_ctrl_mappings); ++i) { const struct uvc_control_mapping *mapping = &uvc_ctrl_mappings[i]; if (uvc_entity_match_guid(ctrl->entity, mapping->entity) && ctrl->info.selector == mapping->selector) __uvc_ctrl_add_mapping(chain, ctrl, mapping); } /* Finally process version-specific mappings. */ mappings = chain->dev->uvc_version < 0x0150 ? uvc_ctrl_mappings_uvc11 : uvc_ctrl_mappings_uvc15; for (i = 0; mappings[i]; ++i) { const struct uvc_control_mapping *mapping = mappings[i]; if (uvc_entity_match_guid(ctrl->entity, mapping->entity) && ctrl->info.selector == mapping->selector) __uvc_ctrl_add_mapping(chain, ctrl, mapping); } } /* * Initialize device controls. */ static int uvc_ctrl_init_chain(struct uvc_video_chain *chain) { struct uvc_entity *entity; unsigned int i; /* Walk the entities list and instantiate controls */ list_for_each_entry(entity, &chain->entities, chain) { struct uvc_control *ctrl; unsigned int bControlSize = 0, ncontrols; u8 *bmControls = NULL; if (UVC_ENTITY_TYPE(entity) == UVC_VC_EXTENSION_UNIT) { bmControls = entity->extension.bmControls; bControlSize = entity->extension.bControlSize; } else if (UVC_ENTITY_TYPE(entity) == UVC_VC_PROCESSING_UNIT) { bmControls = entity->processing.bmControls; bControlSize = entity->processing.bControlSize; } else if (UVC_ENTITY_TYPE(entity) == UVC_ITT_CAMERA) { bmControls = entity->camera.bmControls; bControlSize = entity->camera.bControlSize; } else if (UVC_ENTITY_TYPE(entity) == UVC_EXT_GPIO_UNIT) { bmControls = entity->gpio.bmControls; bControlSize = entity->gpio.bControlSize; } /* Remove bogus/blacklisted controls */ uvc_ctrl_prune_entity(chain->dev, entity); /* Count supported controls and allocate the controls array */ ncontrols = memweight(bmControls, bControlSize); if (ncontrols == 0) continue; entity->controls = kcalloc(ncontrols, sizeof(*ctrl), GFP_KERNEL); if (entity->controls == NULL) return -ENOMEM; entity->ncontrols = ncontrols; /* Initialize all supported controls */ ctrl = entity->controls; for (i = 0; i < bControlSize * 8; ++i) { if (uvc_test_bit(bmControls, i) == 0) continue; ctrl->entity = entity; ctrl->index = i; uvc_ctrl_init_ctrl(chain, ctrl); ctrl++; } } return 0; } int uvc_ctrl_init_device(struct uvc_device *dev) { struct uvc_video_chain *chain; int ret; INIT_WORK(&dev->async_ctrl.work, uvc_ctrl_status_event_work); list_for_each_entry(chain, &dev->chains, list) { ret = uvc_ctrl_init_chain(chain); if (ret) return ret; } return 0; } /* * Cleanup device controls. */ static void uvc_ctrl_cleanup_mappings(struct uvc_device *dev, struct uvc_control *ctrl) { struct uvc_control_mapping *mapping, *nm; list_for_each_entry_safe(mapping, nm, &ctrl->info.mappings, list) { list_del(&mapping->list); kfree(mapping->menu_names); kfree(mapping->menu_mapping); kfree(mapping->name); kfree(mapping); } } void uvc_ctrl_cleanup_device(struct uvc_device *dev) { struct uvc_entity *entity; unsigned int i; /* Can be uninitialized if we are aborting on probe error. */ if (dev->async_ctrl.work.func) cancel_work_sync(&dev->async_ctrl.work); /* Free controls and control mappings for all entities. 
 */
	list_for_each_entry(entity, &dev->entities, list) {
		for (i = 0; i < entity->ncontrols; ++i) {
			struct uvc_control *ctrl = &entity->controls[i];

			if (!ctrl->initialized)
				continue;

			uvc_ctrl_cleanup_mappings(dev, ctrl);
			kfree(ctrl->uvc_data);
		}

		kfree(entity->controls);
	}
}
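/*
 * Illustrative sketch, not part of the driver: one plausible way a caller
 * (e.g. an extended-controls set handler) could use the begin/commit/rollback
 * contract documented above. The function name and error handling here are
 * hypothetical; only uvc_ctrl_begin(), uvc_ctrl_set() and __uvc_ctrl_commit()
 * come from the code above.
 */
static int example_set_ext_ctrls(struct uvc_fh *handle,
				 struct v4l2_ext_controls *ctrls)
{
	unsigned int i;
	int ret;

	/* Take the control lock for the whole transaction. */
	ret = uvc_ctrl_begin(handle->chain);
	if (ret < 0)
		return ret;

	for (i = 0; i < ctrls->count; ++i) {
		ret = uvc_ctrl_set(handle, &ctrls->controls[i]);
		if (ret < 0) {
			/*
			 * Roll back: dirty controls are restored from their
			 * UVC_CTRL_DATA_BACKUP copy and the lock is released.
			 */
			__uvc_ctrl_commit(handle, 1, NULL);
			ctrls->error_idx = i;
			return ret;
		}
	}

	/* Apply all dirty controls to the device and release the lock. */
	return __uvc_ctrl_commit(handle, 0, ctrls);
}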
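/*
 * Illustrative sketch, not part of the driver: registering a device-specific
 * mapping for an extension unit (XU) control through uvc_ctrl_add_mapping(),
 * as described in the mapping comments above. The control ID, GUID, selector
 * and field layout below are hypothetical placeholders, not values for any
 * real device.
 */
static int example_add_xu_mapping(struct uvc_video_chain *chain)
{
	static const struct uvc_control_mapping mapping = {
		.id		= V4L2_CID_USER_BASE + 0x1000,	/* hypothetical control ID */
		.entity		= { 0 },	/* 16-byte XU GUID of the device */
		.selector	= 1,		/* first control of the XU */
		.size		= 16,		/* mapped field width, in bits */
		.offset		= 0,		/* bit offset inside the UVC control */
		.v4l2_type	= V4L2_CTRL_TYPE_INTEGER,
		.data_type	= UVC_CTRL_DATA_TYPE_UNSIGNED,
	};

	/*
	 * Per the code above, this returns -ENOENT if no control with this
	 * GUID/selector exists on the chain, -EINVAL if size/offset don't fit
	 * the control, and -EEXIST if the V4L2 ID is already mapped.
	 */
	return uvc_ctrl_add_mapping(chain, &mapping);
}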
/* Connection tracking via netlink socket. Allows for user space
 * protocol helpers and general trouble making from userspace.
 *
 * (C) 2001 by Jay Schulist <jschlst@samba.org>
 * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org>
 * (C) 2003 by Patrick Mchardy <kaber@trash.net>
 * (C) 2005-2012 by Pablo Neira Ayuso <pablo@netfilter.org>
 *
 * Initial connection tracking via netlink development funded and
 * generally made possible by Network Robots, Inc. (www.networkrobots.com)
 *
 * Further development of this code funded by Astaro AG (http://www.astaro.com)
 *
 * This software may be used and distributed according to the terms
 * of the GNU General Public License, incorporated herein by reference.
*/ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/rculist.h> #include <linux/rculist_nulls.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netlink.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/siphash.h> #include <linux/netfilter.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_synproxy.h> #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #endif #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include "nf_internals.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { struct nf_conn *last; unsigned int cpu; bool done; }; static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTO_NUM, tuple->dst.protonum)) goto nla_put_failure; if (likely(l4proto->tuple_to_nlattr)) ret = l4proto->tuple_to_nlattr(skb, tuple); nla_nest_end(skb, nest_parms); return ret; nla_put_failure: return -1; } static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) return -EMSGSIZE; return 0; } static int ipv6_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) || nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6)) return -EMSGSIZE; return 0; } static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { int ret = 0; struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, CTA_TUPLE_IP); if (!nest_parms) goto nla_put_failure; switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_tuple_to_nlattr(skb, tuple); break; case NFPROTO_IPV6: ret = ipv6_tuple_to_nlattr(skb, tuple); break; } nla_nest_end(skb, nest_parms); return ret; nla_put_failure: return -1; } static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_l4proto *l4proto; int ret; rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { l4proto = nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); } rcu_read_unlock(); return ret; } static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, const struct nf_conntrack_zone *zone, int dir) { if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) return 0; if (nla_put_be16(skb, attrtype, htons(zone->id))) goto nla_put_failure; return 0; nla_put_failure: 
return -1; } static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct, bool skip_zero) { long timeout; if (nf_ct_is_confirmed(ct)) timeout = nf_ct_expires(ct) / HZ; else timeout = ct->timeout / HZ; if (skip_zero && timeout == 0) return 0; if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct, bool destroy) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; int ret; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->to_nlattr) return 0; nest_proto = nla_nest_start(skb, CTA_PROTOINFO); if (!nest_proto) goto nla_put_failure; ret = l4proto->to_nlattr(skb, nest_proto, ct, destroy); nla_nest_end(skb, nest_proto); return ret; nla_put_failure: return -1; } static int ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; if (!help) return 0; rcu_read_lock(); helper = rcu_dereference(help->helper); if (!helper) goto out; nest_helper = nla_nest_start(skb, CTA_HELP); if (!nest_helper) goto nla_put_failure; if (nla_put_string(skb, CTA_HELP_NAME, helper->name)) goto nla_put_failure; if (helper->to_nlattr) helper->to_nlattr(skb, ct); nla_nest_end(skb, nest_helper); out: rcu_read_unlock(); return 0; nla_put_failure: rcu_read_unlock(); return -1; } static int dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, enum ip_conntrack_dir dir, int type) { enum ctattr_type attr = dir ? 
CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; struct nf_conn_counter *counter = acct->counter; struct nlattr *nest_count; u64 pkts, bytes; if (type == IPCTNL_MSG_CT_GET_CTRZERO) { pkts = atomic64_xchg(&counter[dir].packets, 0); bytes = atomic64_xchg(&counter[dir].bytes, 0); } else { pkts = atomic64_read(&counter[dir].packets); bytes = atomic64_read(&counter[dir].bytes); } nest_count = nla_nest_start(skb, attr); if (!nest_count) goto nla_put_failure; if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts), CTA_COUNTERS_PAD) || nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes), CTA_COUNTERS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest_count); return 0; nla_put_failure: return -1; } static int ctnetlink_dump_acct(struct sk_buff *skb, const struct nf_conn *ct, int type) { struct nf_conn_acct *acct = nf_conn_acct_find(ct); if (!acct) return 0; if (dump_counters(skb, acct, IP_CT_DIR_ORIGINAL, type) < 0) return -1; if (dump_counters(skb, acct, IP_CT_DIR_REPLY, type) < 0) return -1; return 0; } static int ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_count; const struct nf_conn_tstamp *tstamp; tstamp = nf_conn_tstamp_find(ct); if (!tstamp) return 0; nest_count = nla_nest_start(skb, CTA_TIMESTAMP); if (!nest_count) goto nla_put_failure; if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start), CTA_TIMESTAMP_PAD) || (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, cpu_to_be64(tstamp->stop), CTA_TIMESTAMP_PAD))) goto nla_put_failure; nla_nest_end(skb, nest_count); return 0; nla_put_failure: return -1; } #ifdef CONFIG_NF_CONNTRACK_MARK static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct, bool dump) { u32 mark = READ_ONCE(ct->mark); if (!mark && !dump) return 0; if (nla_put_be32(skb, CTA_MARK, htonl(mark))) goto nla_put_failure; return 0; nla_put_failure: return -1; } #else #define ctnetlink_dump_mark(a, b, c) (0) #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; int len, ret; char *secctx; ret = security_secid_to_secctx(ct->secmark, &secctx, &len); if (ret) return 0; ret = -1; nest_secctx = nla_nest_start(skb, CTA_SECCTX); if (!nest_secctx) goto nla_put_failure; if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: security_release_secctx(secctx, len); return ret; } #else #define ctnetlink_dump_secctx(a, b) (0) #endif #ifdef CONFIG_NF_CONNTRACK_LABELS static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); if (!labels) return 0; return nla_total_size(sizeof(labels->bits)); } static int ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); unsigned int i; if (!labels) return 0; i = 0; do { if (labels->bits[i] != 0) return nla_put(skb, CTA_LABELS, sizeof(labels->bits), labels->bits); i++; } while (i < ARRAY_SIZE(labels->bits)); return 0; } #else #define ctnetlink_dump_labels(a, b) (0) #define ctnetlink_label_size(a) (0) #endif #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_parms; if (!(ct->status & IPS_EXPECTED)) return 0; nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, 
				  master_tuple(ct)) < 0)
		goto nla_put_failure;
	nla_nest_end(skb, nest_parms);

	return 0;

nla_put_failure:
	return -1;
}

static int dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq,
			   int type)
{
	struct nlattr *nest_parms;

	nest_parms = nla_nest_start(skb, type);
	if (!nest_parms)
		goto nla_put_failure;

	if (nla_put_be32(skb, CTA_SEQADJ_CORRECTION_POS,
			 htonl(seq->correction_pos)) ||
	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_BEFORE,
			 htonl(seq->offset_before)) ||
	    nla_put_be32(skb, CTA_SEQADJ_OFFSET_AFTER,
			 htonl(seq->offset_after)))
		goto nla_put_failure;

	nla_nest_end(skb, nest_parms);

	return 0;

nla_put_failure:
	return -1;
}

static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct)
{
	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
	struct nf_ct_seqadj *seq;

	if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)
		return 0;

	spin_lock_bh(&ct->lock);
	seq = &seqadj->seq[IP_CT_DIR_ORIGINAL];
	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)
		goto err;

	seq = &seqadj->seq[IP_CT_DIR_REPLY];
	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)
		goto err;

	spin_unlock_bh(&ct->lock);
	return 0;
err:
	spin_unlock_bh(&ct->lock);
	return -1;
}

static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct)
{
	struct nf_conn_synproxy *synproxy = nfct_synproxy(ct);
	struct nlattr *nest_parms;

	if (!synproxy)
		return 0;

	nest_parms = nla_nest_start(skb, CTA_SYNPROXY);
	if (!nest_parms)
		goto nla_put_failure;

	if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(synproxy->isn)) ||
	    nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(synproxy->its)) ||
	    nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(synproxy->tsoff)))
		goto nla_put_failure;

	nla_nest_end(skb, nest_parms);

	return 0;

nla_put_failure:
	return -1;
}

static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{
	__be32 id = (__force __be32)nf_ct_get_id(ct);

	if (nla_put_be32(skb, CTA_ID, id))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}

static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct)
{
	if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}

/* all these functions access ct->ext. Caller must either hold a reference
 * on ct or prevent its deletion by holding either the bucket spinlock or
 * pcpu dying list lock.
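 *
 * As a rough illustrative sketch (not part of the original source), a caller
 * that only found ct under RCU would typically pin the entry first, e.g.:
 *
 *	if (refcount_inc_not_zero(&ct->ct_general.use)) {
 *		ctnetlink_dump_extinfo(skb, ct, type);
 *		nf_ct_put(ct);
 *	}
 *
 * so that ct->ext cannot be reallocated or freed while it is being dumped.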
*/ static int ctnetlink_dump_extinfo(struct sk_buff *skb, struct nf_conn *ct, u32 type) { if (ctnetlink_dump_acct(skb, ct, type) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_labels(skb, ct) < 0 || ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || ctnetlink_dump_ct_synproxy(skb, ct) < 0) return -1; return 0; } static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) { if (ctnetlink_dump_status(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct, true) < 0 || ctnetlink_dump_secctx(skb, ct) < 0 || ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0) return -1; if (!test_bit(IPS_OFFLOAD_BIT, &ct->status) && (ctnetlink_dump_timeout(skb, ct, false) < 0 || ctnetlink_dump_protoinfo(skb, ct, false) < 0)) return -1; return 0; } static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nlattr *nest_parms; unsigned int event; if (portid) flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct), NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_ORIG) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_REPL) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_info(skb, ct) < 0) goto nla_put_failure; if (extinfo && ctnetlink_dump_extinfo(skb, ct, type) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = { [CTA_IP_V4_SRC] = { .type = NLA_U32 }, [CTA_IP_V4_DST] = { .type = NLA_U32 }, [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 }, [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 }, }; #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS) static size_t ctnetlink_proto_size(const struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; size_t len, len4 = 0; len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); len += l4proto->nlattr_size; if (l4proto->nlattr_tuple_size) { len4 = l4proto->nlattr_tuple_size(); len4 *= 3u; /* ORIG, REPLY, MASTER */ } return len + len4; } #endif static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) return 0; return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ ; } static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK int len, ret; ret = 
security_secid_to_secctx(ct->secmark, NULL, &len); if (ret) return 0; return nla_total_size(0) /* CTA_SECCTX */ + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ #else return 0; #endif } static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) return 0; return nla_total_size(0) + 2 * nla_total_size_64bit(sizeof(uint64_t)); #else return 0; #endif } #ifdef CONFIG_NF_CONNTRACK_EVENTS static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { return NLMSG_ALIGN(sizeof(struct nfgenmsg)) + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + ctnetlink_acct_size(ct) + ctnetlink_timestamp_size(ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + nla_total_size(0) /* CTA_PROTOINFO */ + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) #if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) ; } static int ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) { const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; struct nlattr *nest_parms; struct nf_conn *ct = item->ct; struct sk_buff *skb; unsigned int type; unsigned int flags = 0, group; int err; if (events & (1 << IPCT_DESTROY)) { type = IPCTNL_MSG_CT_DELETE; group = NFNLGRP_CONNTRACK_DESTROY; } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { type = IPCTNL_MSG_CT_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; group = NFNLGRP_CONNTRACK_NEW; } else if (events) { type = IPCTNL_MSG_CT_NEW; group = NFNLGRP_CONNTRACK_UPDATE; } else return 0; net = nf_ct_net(ct); if (!item->report && !nfnetlink_has_listeners(net, group)) return 0; skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); if (skb == NULL) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct), NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_ORIG) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_REPL) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; if (events & (1 << IPCT_DESTROY)) { if (ctnetlink_dump_timeout(skb, ct, true) < 0) 
goto nla_put_failure; if (ctnetlink_dump_acct(skb, ct, type) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_protoinfo(skb, ct, true) < 0) goto nla_put_failure; } else { if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; if (events & (1 << IPCT_PROTOINFO) && ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) && ctnetlink_dump_helpinfo(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_SECMARK if ((events & (1 << IPCT_SECMARK) || ct->secmark) && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif if (events & (1 << IPCT_LABEL) && ctnetlink_dump_labels(skb, ct) < 0) goto nla_put_failure; if (events & (1 << IPCT_RELATED) && ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; if (events & (1 << IPCT_SEQADJ) && ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; if (events & (1 << IPCT_SYNPROXY) && ctnetlink_dump_ct_synproxy(skb, ct) < 0) goto nla_put_failure; } #ifdef CONFIG_NF_CONNTRACK_MARK if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK))) goto nla_put_failure; #endif nlmsg_end(skb, nlh); err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); if (err == -ENOBUFS || err == -EAGAIN) return -ENOBUFS; return 0; nla_put_failure: nlmsg_cancel(skb, nlh); nlmsg_failure: kfree_skb(skb); errout: if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0) return -ENOBUFS; return 0; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ static int ctnetlink_done(struct netlink_callback *cb) { if (cb->args[1]) nf_ct_put((struct nf_conn *)cb->args[1]); kfree(cb->data); return 0; } struct ctnetlink_filter_u32 { u32 val; u32 mask; }; struct ctnetlink_filter { u8 family; bool zone_filter; u_int32_t orig_flags; u_int32_t reply_flags; struct nf_conntrack_tuple orig; struct nf_conntrack_tuple reply; struct nf_conntrack_zone zone; struct ctnetlink_filter_u32 mark; struct ctnetlink_filter_u32 status; }; static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, }; static int ctnetlink_parse_filter(const struct nlattr *attr, struct ctnetlink_filter *filter) { struct nlattr *tb[CTA_FILTER_MAX + 1]; int ret = 0; ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy, NULL); if (ret) return ret; if (tb[CTA_FILTER_ORIG_FLAGS]) { filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); if (filter->orig_flags & ~CTA_FILTER_F_ALL) return -EOPNOTSUPP; } if (tb[CTA_FILTER_REPLY_FLAGS]) { filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); if (filter->reply_flags & ~CTA_FILTER_F_ALL) return -EOPNOTSUPP; } return 0; } static int ctnetlink_parse_zone(const struct nlattr *attr, struct nf_conntrack_zone *zone); static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, u32 type, u_int8_t l3num, struct nf_conntrack_zone *zone, u_int32_t flags); static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark, const struct nlattr * const cda[]) { #ifdef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK]) { mark->val = ntohl(nla_get_be32(cda[CTA_MARK])); if (cda[CTA_MARK_MASK]) mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); else mark->mask = 0xffffffff; } else if (cda[CTA_MARK_MASK]) { return -EINVAL; } #endif return 0; } static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status, const struct nlattr * const cda[]) { if (cda[CTA_STATUS]) { status->val = 
ntohl(nla_get_be32(cda[CTA_STATUS])); if (cda[CTA_STATUS_MASK]) status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK])); else status->mask = status->val; /* status->val == 0? always true, else always false. */ if (status->mask == 0) return -EINVAL; } else if (cda[CTA_STATUS_MASK]) { return -EINVAL; } /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */ BUILD_BUG_ON(__IPS_MAX_BIT >= 32); return 0; } static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { struct ctnetlink_filter *filter; int err; #ifndef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) return ERR_PTR(-EOPNOTSUPP); #endif filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) return ERR_PTR(-ENOMEM); filter->family = family; err = ctnetlink_filter_parse_mark(&filter->mark, cda); if (err) goto err_filter; err = ctnetlink_filter_parse_status(&filter->status, cda); if (err) goto err_filter; if (cda[CTA_ZONE]) { err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); if (err < 0) goto err_filter; filter->zone_filter = true; } if (!cda[CTA_FILTER]) return filter; err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); if (err < 0) goto err_filter; if (filter->orig_flags) { if (!cda[CTA_TUPLE_ORIG]) { err = -EINVAL; goto err_filter; } err = ctnetlink_parse_tuple_filter(cda, &filter->orig, CTA_TUPLE_ORIG, filter->family, &filter->zone, filter->orig_flags); if (err < 0) goto err_filter; } if (filter->reply_flags) { if (!cda[CTA_TUPLE_REPLY]) { err = -EINVAL; goto err_filter; } err = ctnetlink_parse_tuple_filter(cda, &filter->reply, CTA_TUPLE_REPLY, filter->family, &filter->zone, filter->reply_flags); if (err < 0) goto err_filter; } return filter; err_filter: kfree(filter); return ERR_PTR(err); } static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS] || cda[CTA_ZONE]; } static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; struct ctnetlink_filter *filter = NULL; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 family = nfmsg->nfgen_family; if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); } cb->data = filter; return 0; } static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, struct nf_conntrack_tuple *ct_tuple, u_int32_t flags, int family) { switch (family) { case NFPROTO_IPV4: if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) return 0; if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) return 0; break; case NFPROTO_IPV6: if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && !ipv6_addr_cmp(&filter_tuple->src.u3.in6, &ct_tuple->src.u3.in6)) return 0; if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && !ipv6_addr_cmp(&filter_tuple->dst.u3.in6, &ct_tuple->dst.u3.in6)) return 0; break; } if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && filter_tuple->dst.protonum != ct_tuple->dst.protonum) return 0; switch (ct_tuple->dst.protonum) { case IPPROTO_TCP: case IPPROTO_UDP: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) return 0; break; case IPPROTO_ICMP: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && filter_tuple->dst.u.icmp.type != 
ct_tuple->dst.u.icmp.type) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) return 0; break; case IPPROTO_ICMPV6: if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) return 0; if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) return 0; break; } return 1; } static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; struct nf_conntrack_tuple *tuple; u32 status; if (filter == NULL) goto out; /* Match entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then match everything. */ if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; if (filter->zone_filter && !nf_ct_zone_equal_any(ct, &filter->zone)) goto ignore_entry; if (filter->orig_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, filter->orig_flags, filter->family)) goto ignore_entry; } if (filter->reply_flags) { tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, filter->reply_flags, filter->family)) goto ignore_entry; } #ifdef CONFIG_NF_CONNTRACK_MARK if ((READ_ONCE(ct->mark) & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif status = (u32)READ_ONCE(ct->status); if ((status & filter->status.mask) != filter->status.val) goto ignore_entry; out: return 1; ignore_entry: return 0; } static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *nf_ct_evict[8]; int res, i; spinlock_t *lockp; last = (struct nf_conn *)cb->args[1]; i = 0; local_bh_disable(); for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: while (i) { i--; if (nf_ct_should_gc(nf_ct_evict[i])) nf_ct_kill(nf_ct_evict[i]); nf_ct_put(nf_ct_evict[i]); } lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); if (cb->args[0] >= nf_conntrack_htable_size) { spin_unlock(lockp); goto out; } hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { /* need to defer nf_ct_kill() until lock is released */ if (i < ARRAY_SIZE(nf_ct_evict) && refcount_inc_not_zero(&ct->ct_general.use)) nf_ct_evict[i++] = ct; continue; } if (!net_eq(net, nf_ct_net(ct))) continue; if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; if (cb->args[1]) { if (ct != last) continue; cb->args[1] = 0; } if (!ctnetlink_filter_match(ct, cb->data)) continue; res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, true, flags); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; spin_unlock(lockp); goto out; } } spin_unlock(lockp); if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: local_bh_enable(); if (last) { /* nf ct hash resize happened, now clear the leftover. 
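	 * If cb->args[1] still points at "last", no newer entry was stashed
	 * during this pass, so clear the stale cursor before dropping the
	 * reference below.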
*/ if ((struct nf_conn *)cb->args[1] == last) cb->args[1] = 0; nf_ct_put(last); } while (i) { i--; if (nf_ct_should_gc(nf_ct_evict[i])) nf_ct_kill(nf_ct_evict[i]); nf_ct_put(nf_ct_evict[i]); } return skb->len; } static int ipv4_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { if (!tb[CTA_IP_V4_SRC]) return -EINVAL; t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); } if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { if (!tb[CTA_IP_V4_DST]) return -EINVAL; t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); } return 0; } static int ipv6_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags) { if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { if (!tb[CTA_IP_V6_SRC]) return -EINVAL; t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); } if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { if (!tb[CTA_IP_V6_DST]) return -EINVAL; t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); } return 0; } static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple, u_int32_t flags) { struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; ret = nla_parse_nested_deprecated(tb, CTA_IP_MAX, attr, cta_ip_nla_policy, NULL); if (ret < 0) return ret; switch (tuple->src.l3num) { case NFPROTO_IPV4: ret = ipv4_nlattr_to_tuple(tb, tuple, flags); break; case NFPROTO_IPV6: ret = ipv6_nlattr_to_tuple(tb, tuple, flags); break; } return ret; } static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_NUM] = { .type = NLA_U8 }, }; static int ctnetlink_parse_tuple_proto(struct nlattr *attr, struct nf_conntrack_tuple *tuple, u_int32_t flags) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; int ret = 0; ret = nla_parse_nested_deprecated(tb, CTA_PROTO_MAX, attr, proto_nla_policy, NULL); if (ret < 0) return ret; if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM))) return 0; if (!tb[CTA_PROTO_NUM]) return -EINVAL; tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); l4proto = nf_ct_l4proto_find(tuple->dst.protonum); if (likely(l4proto->nlattr_to_tuple)) { ret = nla_validate_nested_deprecated(attr, CTA_PROTO_MAX, l4proto->nla_policy, NULL); if (ret == 0) ret = l4proto->nlattr_to_tuple(tb, tuple, flags); } rcu_read_unlock(); return ret; } static int ctnetlink_parse_zone(const struct nlattr *attr, struct nf_conntrack_zone *zone) { nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID, NF_CT_DEFAULT_ZONE_DIR, 0); #ifdef CONFIG_NF_CONNTRACK_ZONES if (attr) zone->id = ntohs(nla_get_be16(attr)); #else if (attr) return -EOPNOTSUPP; #endif return 0; } static int ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type, struct nf_conntrack_zone *zone) { int ret; if (zone->id != NF_CT_DEFAULT_ZONE_ID) return -EINVAL; ret = ctnetlink_parse_zone(attr, zone); if (ret < 0) return ret; if (type == CTA_TUPLE_REPLY) zone->dir = NF_CT_ZONE_DIR_REPL; else zone->dir = NF_CT_ZONE_DIR_ORIG; return 0; } static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_IP] = { .type = NLA_NESTED }, [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; #define CTA_FILTER_F_ALL_CTA_PROTO \ (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \ CTA_FILTER_F_CTA_PROTO_DST_PORT | \ CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \ CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \ CTA_FILTER_F_CTA_PROTO_ICMP_ID | \ CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \ CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \ CTA_FILTER_F_CTA_PROTO_ICMPV6_ID) static int ctnetlink_parse_tuple_filter(const struct nlattr 
* const cda[], struct nf_conntrack_tuple *tuple, u32 type, u_int8_t l3num, struct nf_conntrack_zone *zone, u_int32_t flags) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; memset(tuple, 0, sizeof(*tuple)); err = nla_parse_nested_deprecated(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy, NULL); if (err < 0) return err; if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6) return -EOPNOTSUPP; tuple->src.l3num = l3num; if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { if (!tb[CTA_TUPLE_IP]) return -EINVAL; err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags); if (err < 0) return err; } if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) { if (!tb[CTA_TUPLE_PROTO]) return -EINVAL; err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags); if (err < 0) return err; } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) { /* Can't manage proto flags without a protonum */ return -EINVAL; } if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) { if (!zone) return -EINVAL; err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE], type, zone); if (err < 0) return err; } /* orig and expect tuples get DIR_ORIGINAL */ if (type == CTA_TUPLE_REPLY) tuple->dst.dir = IP_CT_DIR_REPLY; else tuple->dst.dir = IP_CT_DIR_ORIGINAL; return 0; } static int ctnetlink_parse_tuple(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, u32 type, u_int8_t l3num, struct nf_conntrack_zone *zone) { return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone, CTA_FILTER_FLAG(ALL)); } static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, }; static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, struct nlattr **helpinfo) { int err; struct nlattr *tb[CTA_HELP_MAX+1]; err = nla_parse_nested_deprecated(tb, CTA_HELP_MAX, attr, help_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_HELP_NAME]) return -EINVAL; *helper_name = nla_data(tb[CTA_HELP_NAME]); if (tb[CTA_HELP_INFO]) *helpinfo = tb[CTA_HELP_INFO]; return 0; } static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, [CTA_STATUS] = { .type = NLA_U32 }, [CTA_PROTOINFO] = { .type = NLA_NESTED }, [CTA_HELP] = { .type = NLA_NESTED }, [CTA_NAT_SRC] = { .type = NLA_NESTED }, [CTA_TIMEOUT] = { .type = NLA_U32 }, [CTA_MARK] = { .type = NLA_U32 }, [CTA_ID] = { .type = NLA_U32 }, [CTA_NAT_DST] = { .type = NLA_NESTED }, [CTA_TUPLE_MASTER] = { .type = NLA_NESTED }, [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NLA_NESTED }, [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NLA_NESTED }, [CTA_ZONE] = { .type = NLA_U16 }, [CTA_MARK_MASK] = { .type = NLA_U32 }, [CTA_LABELS] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, [CTA_STATUS_MASK] = { .type = NLA_U32 }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) { return ctnetlink_filter_match(ct, data); } static int ctnetlink_flush_conntrack(struct net *net, const struct nlattr * const cda[], u32 portid, int report, u8 family) { struct ctnetlink_filter *filter = NULL; struct nf_ct_iter_data iter = { .net = net, .portid = portid, .report = report, }; if (ctnetlink_needs_filter(family, cda)) { if (cda[CTA_FILTER]) return -EOPNOTSUPP; filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); iter.data = filter; } 
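	/*
	 * Hedged summary, not from the original source: the iterator below is
	 * expected to remove every entry for which ctnetlink_flush_iterate()
	 * returns true; with iter.data left NULL, ctnetlink_filter_match()
	 * accepts everything, so the whole table for this netns is flushed.
	 */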
nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter); kfree(filter); return 0; } static int ctnetlink_del_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u8 family = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) return err; if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, family, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, family, &zone); else { u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, nlmsg_report(info->nlh), u3); } if (err < 0) return err; h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); if (cda[CTA_ID]) { __be32 id = nla_get_be32(cda[CTA_ID]); if (id != (__force __be32)nf_ct_get_id(ct)) { nf_ct_put(ct); return -ENOENT; } } nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_put(ct); return 0; } static int ctnetlink_get_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; struct sk_buff *skb2; struct nf_conn *ct; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = ctnetlink_start, .dump = ctnetlink_dump_table, .done = ctnetlink_done, .data = (void *)cda, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) return err; if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3, &zone); else return -EINVAL; if (err < 0) return err; h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) { nf_ct_put(ct); return -ENOMEM; } err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); if (err <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static int ctnetlink_done_list(struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; if (ctx->last) nf_ct_put(ctx->last); return 0; } #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_one_entry(struct sk_buff *skb, struct netlink_callback *cb, struct nf_conn *ct, bool dying) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 l3proto = nfmsg->nfgen_family; int res; if (l3proto && nf_ct_l3num(ct) != l3proto) return 0; if (ctx->last) { if (ct != ctx->last) return 0; ctx->last = NULL; } /* We can't dump extension info for the unconfirmed * list because unconfirmed conntracks can have * ct->ext reallocated (and thus freed). * * In the dying list case ct->ext can't be free'd * until after we drop pcpu->lock. 
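 *
 * This is also why ctnetlink_fill_info() below is invoked with
 * extinfo == dying: extension data is only emitted for entries that sit on
 * the dying list.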
*/ res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying, 0); if (res < 0) { if (!refcount_inc_not_zero(&ct->ct_general.use)) return 0; ctx->last = ct; } return res; } #endif static int ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) { return 0; } static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; struct nf_conn *last = ctx->last; #ifdef CONFIG_NF_CONNTRACK_EVENTS const struct net *net = sock_net(skb->sk); struct nf_conntrack_net_ecache *ecache_net; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; #endif if (ctx->done) return 0; ctx->last = NULL; #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache_net = nf_conn_pernet_ecache(net); spin_lock_bh(&ecache_net->dying_lock); hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) { struct nf_conn *ct; int res; ct = nf_ct_tuplehash_to_ctrack(h); if (last && last != ct) continue; res = ctnetlink_dump_one_entry(skb, cb, ct, true); if (res < 0) { spin_unlock_bh(&ecache_net->dying_lock); nf_ct_put(last); return skb->len; } nf_ct_put(last); last = NULL; } spin_unlock_bh(&ecache_net->dying_lock); #endif ctx->done = true; nf_ct_put(last); return skb->len; } static int ctnetlink_get_ct_dying(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; } static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; } #if IS_ENABLED(CONFIG_NF_NAT) static int ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) __must_hold(RCU) { const struct nf_nat_hook *nat_hook; int err; nat_hook = rcu_dereference(nf_nat_hook); if (!nat_hook) { #ifdef CONFIG_MODULES rcu_read_unlock(); nfnl_unlock(NFNL_SUBSYS_CTNETLINK); if (request_module("nf-nat") < 0) { nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); return -EOPNOTSUPP; } nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook) return -EAGAIN; #endif return -EOPNOTSUPP; } err = nat_hook->parse_nat_setup(ct, manip, attr); if (err == -EAGAIN) { #ifdef CONFIG_MODULES rcu_read_unlock(); nfnl_unlock(NFNL_SUBSYS_CTNETLINK); if (request_module("nf-nat-%u", nf_ct_l3num(ct)) < 0) { nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); return -EOPNOTSUPP; } nfnl_lock(NFNL_SUBSYS_CTNETLINK); rcu_read_lock(); #else err = -EOPNOTSUPP; #endif } return err; } #endif static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { return nf_ct_change_status_common(ct, ntohl(nla_get_be32(cda[CTA_STATUS]))); } static int ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) { #if IS_ENABLED(CONFIG_NF_NAT) int ret; if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) return 0; ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_DST, cda[CTA_NAT_DST]); if (ret < 0) return ret; return ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, cda[CTA_NAT_SRC]); #else if (!cda[CTA_NAT_DST] && 
!cda[CTA_NAT_SRC]) return 0; return -EOPNOTSUPP; #endif } static int ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) { struct nf_conntrack_helper *helper; struct nf_conn_help *help = nfct_help(ct); char *helpname = NULL; struct nlattr *helpinfo = NULL; int err; err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) return err; /* don't change helper of sibling connections */ if (ct->master) { /* If we try to change the helper to the same thing twice, * treat the second attempt as a no-op instead of returning * an error. */ err = -EBUSY; if (help) { rcu_read_lock(); helper = rcu_dereference(help->helper); if (helper && !strcmp(helper->name, helpname)) err = 0; rcu_read_unlock(); } return err; } if (!strcmp(helpname, "")) { if (help && help->helper) { /* we had a helper before ... */ nf_ct_remove_expectations(ct); RCU_INIT_POINTER(help->helper, NULL); } return 0; } rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); return -EOPNOTSUPP; } if (help) { if (rcu_access_pointer(help->helper) == helper) { /* update private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); err = 0; } else err = -EBUSY; } else { /* we cannot set a helper for an existing conntrack */ err = -EOPNOTSUPP; } rcu_read_unlock(); return err; } static int ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) { return __nf_ct_change_timeout(ct, (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ); } #if defined(CONFIG_NF_CONNTRACK_MARK) static void ctnetlink_change_mark(struct nf_conn *ct, const struct nlattr * const cda[]) { u32 mark, newmark, mask = 0; if (cda[CTA_MARK_MASK]) mask = ~ntohl(nla_get_be32(cda[CTA_MARK_MASK])); mark = ntohl(nla_get_be32(cda[CTA_MARK])); newmark = (READ_ONCE(ct->mark) & mask) ^ mark; if (newmark != READ_ONCE(ct->mark)) WRITE_ONCE(ct->mark, newmark); } #endif static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; static int ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) { const struct nlattr *attr = cda[CTA_PROTOINFO]; const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTOINFO_MAX+1]; int err = 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy, NULL); if (err < 0) return err; l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->from_nlattr) err = l4proto->from_nlattr(tb, ct); return err; } static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { [CTA_SEQADJ_CORRECTION_POS] = { .type = NLA_U32 }, [CTA_SEQADJ_OFFSET_BEFORE] = { .type = NLA_U32 }, [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; static int change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) { int err; struct nlattr *cda[CTA_SEQADJ_MAX+1]; err = nla_parse_nested_deprecated(cda, CTA_SEQADJ_MAX, attr, seqadj_policy, NULL); if (err < 0) return err; if (!cda[CTA_SEQADJ_CORRECTION_POS]) return -EINVAL; seq->correction_pos = ntohl(nla_get_be32(cda[CTA_SEQADJ_CORRECTION_POS])); if (!cda[CTA_SEQADJ_OFFSET_BEFORE]) return -EINVAL; seq->offset_before = ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_BEFORE])); if (!cda[CTA_SEQADJ_OFFSET_AFTER]) return -EINVAL; seq->offset_after = ntohl(nla_get_be32(cda[CTA_SEQADJ_OFFSET_AFTER])); return 0; } static int 
ctnetlink_change_seq_adj(struct nf_conn *ct, const struct nlattr * const cda[]) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); int ret = 0; if (!seqadj) return 0; spin_lock_bh(&ct->lock); if (cda[CTA_SEQ_ADJ_ORIG]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) goto err; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } if (cda[CTA_SEQ_ADJ_REPLY]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) goto err; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } spin_unlock_bh(&ct->lock); return 0; err: spin_unlock_bh(&ct->lock); return ret; } static const struct nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = { [CTA_SYNPROXY_ISN] = { .type = NLA_U32 }, [CTA_SYNPROXY_ITS] = { .type = NLA_U32 }, [CTA_SYNPROXY_TSOFF] = { .type = NLA_U32 }, }; static int ctnetlink_change_synproxy(struct nf_conn *ct, const struct nlattr * const cda[]) { struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); struct nlattr *tb[CTA_SYNPROXY_MAX + 1]; int err; if (!synproxy) return 0; err = nla_parse_nested_deprecated(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY], synproxy_policy, NULL); if (err < 0) return err; if (!tb[CTA_SYNPROXY_ISN] || !tb[CTA_SYNPROXY_ITS] || !tb[CTA_SYNPROXY_TSOFF]) return -EINVAL; synproxy->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN])); synproxy->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS])); synproxy->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF])); return 0; } static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) { #ifdef CONFIG_NF_CONNTRACK_LABELS size_t len = nla_len(cda[CTA_LABELS]); const void *mask = cda[CTA_LABELS_MASK]; if (len & (sizeof(u32)-1)) /* must be multiple of u32 */ return -EINVAL; if (mask) { if (nla_len(cda[CTA_LABELS_MASK]) == 0 || nla_len(cda[CTA_LABELS_MASK]) != len) return -EINVAL; mask = nla_data(cda[CTA_LABELS_MASK]); } len /= sizeof(u32); return nf_connlabels_replace(ct, nla_data(cda[CTA_LABELS]), mask, len); #else return -EOPNOTSUPP; #endif } static int ctnetlink_change_conntrack(struct nf_conn *ct, const struct nlattr * const cda[]) { int err; /* only allow NAT changes and master assignation for new conntracks */ if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER]) return -EOPNOTSUPP; if (cda[CTA_HELP]) { err = ctnetlink_change_helper(ct, cda); if (err < 0) return err; } if (cda[CTA_TIMEOUT]) { err = ctnetlink_change_timeout(ct, cda); if (err < 0) return err; } if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) return err; } if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) return err; } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ctnetlink_change_mark(ct, cda); #endif if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) return err; } if (cda[CTA_SYNPROXY]) { err = ctnetlink_change_synproxy(ct, cda); if (err < 0) return err; } if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) return err; } return 0; } static struct nf_conn * ctnetlink_create_conntrack(struct net *net, const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, u8 u3) { struct nf_conn *ct; int err = -EINVAL; struct nf_conntrack_helper *helper; struct nf_conn_tstamp *tstamp; u64 timeout; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) return ERR_PTR(-ENOMEM); if (!cda[CTA_TIMEOUT]) goto err1; 
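	/*
	 * Illustrative note (not from the original source): CTA_TIMEOUT is
	 * mandatory here, and ctnetlink_new_conntrack() additionally insists
	 * on both CTA_TUPLE_ORIG and CTA_TUPLE_REPLY, so those three
	 * attributes are the minimum set a userspace creation request must
	 * carry.
	 */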
rcu_read_lock(); if (cda[CTA_HELP]) { char *helpname = NULL; struct nlattr *helpinfo = NULL; err = ctnetlink_parse_help(cda[CTA_HELP], &helpname, &helpinfo); if (err < 0) goto err2; helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { err = -EOPNOTSUPP; goto err1; } rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper) { err = -EAGAIN; goto err2; } rcu_read_unlock(); #endif err = -EOPNOTSUPP; goto err1; } else { struct nf_conn_help *help; help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help == NULL) { err = -ENOMEM; goto err2; } /* set private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); /* disable helper auto-assignment for this entry */ ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } } err = ctnetlink_setup_nat(ct, cda); if (err < 0) goto err2; nf_ct_acct_ext_add(ct, GFP_ATOMIC); nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); nf_ct_labels_ext_add(ct); nfct_seqadj_ext_add(ct); nfct_synproxy_ext_add(ct); /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; __nf_ct_set_timeout(ct, timeout); if (cda[CTA_STATUS]) { err = ctnetlink_change_status(ct, cda); if (err < 0) goto err2; } if (cda[CTA_SEQ_ADJ_ORIG] || cda[CTA_SEQ_ADJ_REPLY]) { err = ctnetlink_change_seq_adj(ct, cda); if (err < 0) goto err2; } memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) goto err2; } if (cda[CTA_SYNPROXY]) { err = ctnetlink_change_synproxy(ct, cda); if (err < 0) goto err2; } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ctnetlink_change_mark(ct, cda); #endif /* setup master conntrack: this is a confirmed expectation */ if (cda[CTA_TUPLE_MASTER]) { struct nf_conntrack_tuple master; struct nf_conntrack_tuple_hash *master_h; struct nf_conn *master_ct; err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3, NULL); if (err < 0) goto err2; master_h = nf_conntrack_find_get(net, zone, &master); if (master_h == NULL) { err = -ENOENT; goto err2; } master_ct = nf_ct_tuplehash_to_ctrack(master_h); __set_bit(IPS_EXPECTED_BIT, &ct->status); ct->master = master_ct; } tstamp = nf_conn_tstamp_find(ct); if (tstamp) tstamp->start = ktime_get_real_ns(); err = nf_conntrack_hash_check_insert(ct); if (err < 0) goto err3; rcu_read_unlock(); return ct; err3: if (ct->master) nf_ct_put(ct->master); err2: rcu_read_unlock(); err1: nf_conntrack_free(ct); return ERR_PTR(err); } static int ctnetlink_new_conntrack(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_zone zone; struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); if (err < 0) return err; if (cda[CTA_TUPLE_ORIG]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_REPLY]) { err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_ORIG]) h = nf_conntrack_find_get(info->net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) h = nf_conntrack_find_get(info->net, &zone, &rtuple); if 
(h == NULL) { err = -ENOENT; if (info->nlh->nlmsg_flags & NLM_F_CREATE) { enum ip_conntrack_events events; if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) return -EINVAL; if (otuple.dst.protonum != rtuple.dst.protonum) return -EINVAL; ct = ctnetlink_create_conntrack(info->net, &zone, cda, &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); err = 0; if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = 1 << IPCT_RELATED; else events = 1 << IPCT_NEW; if (cda[CTA_LABELS] && ctnetlink_attach_labels(ct, cda) == 0) events |= (1 << IPCT_LABEL); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | (1 << IPCT_SYNPROXY) | events, ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_put(ct); } return err; } /* implicit 'else' */ err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) { err = ctnetlink_change_conntrack(ct, cda); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | (1 << IPCT_MARK) | (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } } nf_ct_put(ct); return err; } static int ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, __u16 cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS_CPU); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, htons(cpu)); if (!nlh) goto nlmsg_failure; if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || nla_put_be32(skb, CTA_STATS_DROP, htonl(st->drop)) || nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, htonl(st->search_restart)) || nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, htonl(st->clash_resolve)) || nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nla_put_failure: nlmsg_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) { int cpu; struct net *net = sock_net(skb->sk); if (cb->args[0] == nr_cpu_ids) return 0; for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { const struct ip_conntrack_stat *st; if (!cpu_possible(cpu)) continue; st = per_cpu_ptr(net->ct.stat, cpu); if (ctnetlink_ct_stat_cpu_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cpu, st) < 0) break; } cb->args[0] = cpu; return skb->len; } static int ctnetlink_stat_ct_cpu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_ct_stat_cpu_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; } static int ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct net *net) { unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; unsigned int nr_conntracks; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; nr_conntracks = nf_conntrack_count(net); if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nla_put_failure: nlmsg_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { struct sk_buff *skb2; int err; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) return -ENOMEM; err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); if (err <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [CTA_EXPECT_ID] = { .type = NLA_U32 }, [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, }; static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, struct nf_conntrack_helper *helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT static size_t ctnetlink_glue_build_size(const struct nf_conn *ct) { return 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + nla_total_size(0) /* CTA_PROTOINFO */ + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) + ctnetlink_acct_size(ct) + ctnetlink_timestamp_size(ct) #if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ #endif #ifdef CONFIG_NF_CONNTRACK_MARK + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) ; } static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) { const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_ORIG) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY); if (!nest_parms) goto nla_put_failure; if 
(ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, NF_CT_ZONE_DIR_REPL) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_timeout(skb, ct, false) < 0) goto nla_put_failure; if (ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 || ctnetlink_dump_timestamp(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_helpinfo(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_SECMARK if (ct->secmark && ctnetlink_dump_secctx(skb, ct) < 0) goto nla_put_failure; #endif if (ct->master && ctnetlink_dump_master(skb, ct) < 0) goto nla_put_failure; if ((ct->status & IPS_SEQ_ADJUST) && ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; if (ctnetlink_dump_ct_synproxy(skb, ct) < 0) goto nla_put_failure; #ifdef CONFIG_NF_CONNTRACK_MARK if (ctnetlink_dump_mark(skb, ct, true) < 0) goto nla_put_failure; #endif if (ctnetlink_dump_labels(skb, ct) < 0) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static int ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, ct_attr); if (!nest_parms) goto nla_put_failure; if (__ctnetlink_glue_build(skb, ct) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); if (nla_put_be32(skb, ct_info_attr, htonl(ctinfo))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static int ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) { unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); unsigned long d = ct->status ^ status; if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) /* SEEN_REPLY bit can only be set */ return -EBUSY; if (d & IPS_ASSURED && !(status & IPS_ASSURED)) /* ASSURED bit can only be set */ return -EBUSY; /* This check is less strict than ctnetlink_change_status() * because callers often flip IPS_EXPECTED bits when sending * an NFQA_CT attribute to the kernel. So ignore the * unchangeable bits but do not error out. Also user programs * are allowed to clear the bits that they are allowed to change. 
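 *
 * Hence the unconditional __nf_ct_change_status(ct, status, ~status) call
 * below; the helper itself is expected to skip the unchangeable bits.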
*/ __nf_ct_change_status(ct, status, ~status); return 0; } static int ctnetlink_glue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct) { int err; if (cda[CTA_TIMEOUT]) { err = ctnetlink_change_timeout(ct, cda); if (err < 0) return err; } if (cda[CTA_STATUS]) { err = ctnetlink_update_status(ct, cda); if (err < 0) return err; } if (cda[CTA_HELP]) { err = ctnetlink_change_helper(ct, cda); if (err < 0) return err; } if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) return err; } #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) { ctnetlink_change_mark(ct, cda); } #endif return 0; } static int ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) { struct nlattr *cda[CTA_MAX+1]; int ret; ret = nla_parse_nested_deprecated(cda, CTA_MAX, attr, ct_nla_policy, NULL); if (ret < 0) return ret; return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); } static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, const struct nf_conn *ct, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { int err; err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, nf_ct_l3num(ct), NULL); if (err < 0) return err; return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, nf_ct_l3num(ct), NULL); } static int ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report) { struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; struct nf_conntrack_helper *helper = NULL; struct nf_conntrack_expect *exp; int err; err = nla_parse_nested_deprecated(cda, CTA_EXPECT_MAX, attr, exp_nla_policy, NULL); if (err < 0) return err; err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, ct, &tuple, &mask); if (err < 0) return err; if (cda[CTA_EXPECT_HELP_NAME]) { const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) return -EOPNOTSUPP; } exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, helper, &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); err = nf_ct_expect_related_report(exp, portid, report, 0); nf_ct_expect_put(exp); return err; } static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, int diff) { if (!(ct->status & IPS_NAT_MASK)) return; nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff); } static const struct nfnl_ct_hook ctnetlink_glue_hook = { .build_size = ctnetlink_glue_build_size, .build = ctnetlink_glue_build, .parse = ctnetlink_glue_parse, .attach_expect = ctnetlink_glue_attach_expect, .seq_adjust = ctnetlink_glue_seqadj, }; #endif /* CONFIG_NETFILTER_NETLINK_GLUE_CT */ /*********************************************************************** * EXPECT ***********************************************************************/ static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, u32 type) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, type); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, tuple) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple m; struct nlattr *nest_parms; int ret; memset(&m, 0xFF, sizeof(m)); memcpy(&m.src.u3, &mask->src.u3, 
sizeof(m.src.u3)); m.src.u.all = mask->src.u.all; m.src.l3num = tuple->src.l3num; m.dst.protonum = tuple->dst.protonum; nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK); if (!nest_parms) goto nla_put_failure; rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { l4proto = nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); } rcu_read_unlock(); if (unlikely(ret < 0)) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } #if IS_ENABLED(CONFIG_NF_NAT) static const union nf_inet_addr any_addr; #endif static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) { static siphash_aligned_key_t exp_id_seed; unsigned long a, b, c, d; net_get_random_once(&exp_id_seed, sizeof(exp_id_seed)); a = (unsigned long)exp; b = (unsigned long)exp->helper; c = (unsigned long)exp->master; d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed); #ifdef CONFIG_64BIT return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed); #else return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed); #endif } static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) { struct nf_conn *master = exp->master; long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; struct nf_conn_help *help; #if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *nest_parms; struct nf_conntrack_tuple nat_tuple = {}; #endif struct nf_ct_helper_expectfn *expfn; if (timeout < 0) timeout = 0; if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) goto nla_put_failure; if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0) goto nla_put_failure; if (ctnetlink_exp_dump_tuple(skb, &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, CTA_EXPECT_MASTER) < 0) goto nla_put_failure; #if IS_ENABLED(CONFIG_NF_NAT) if (!nf_inet_addr_cmp(&exp->saved_addr, &any_addr) || exp->saved_proto.all) { nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT); if (!nest_parms) goto nla_put_failure; if (nla_put_be32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir))) goto nla_put_failure; nat_tuple.src.l3num = nf_ct_l3num(master); nat_tuple.src.u3 = exp->saved_addr; nat_tuple.dst.protonum = nf_ct_protonum(master); nat_tuple.src.u = exp->saved_proto; if (ctnetlink_exp_dump_tuple(skb, &nat_tuple, CTA_EXPECT_NAT_TUPLE) < 0) goto nla_put_failure; nla_nest_end(skb, nest_parms); } #endif if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) || nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) goto nla_put_failure; help = nfct_help(master); if (help) { struct nf_conntrack_helper *helper; helper = rcu_dereference(help->helper); if (helper && nla_put_string(skb, CTA_EXPECT_HELP_NAME, helper->name)) goto nla_put_failure; } expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn); if (expfn != NULL && nla_put_string(skb, CTA_EXPECT_FN, expfn->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; unsigned int flags = portid ? 
NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, exp->tuple.src.l3num, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item) { struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int type, group; int flags = 0; if (events & (1 << IPEXP_DESTROY)) { type = IPCTNL_MSG_EXP_DELETE; group = NFNLGRP_CONNTRACK_EXP_DESTROY; } else if (events & (1 << IPEXP_NEW)) { type = IPCTNL_MSG_EXP_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; group = NFNLGRP_CONNTRACK_EXP_NEW; } else return 0; if (!item->report && !nfnetlink_has_listeners(net, group)) return 0; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (skb == NULL) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, exp->tuple.src.l3num, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); nlmsg_failure: kfree_skb(skb); errout: nfnetlink_set_err(net, 0, 0, -ENOBUFS); return 0; } #endif static int ctnetlink_exp_done(struct netlink_callback *cb) { if (cb->args[1]) nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); return 0; } static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; rcu_read_lock(); last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (!net_eq(nf_ct_net(exp->master), net)) continue; if (cb->args[1]) { if (exp != last) continue; cb->args[1] = 0; } if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { if (!refcount_inc_not_zero(&exp->use)) continue; cb->args[1] = (unsigned long)exp; goto out; } } if (cb->args[1]) { cb->args[1] = 0; goto restart; } } out: rcu_read_unlock(); if (last) nf_ct_expect_put(last); return skb->len; } static int ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nf_conn *ct = cb->data; struct nf_conn_help *help = nfct_help(ct); u_int8_t l3proto = nfmsg->nfgen_family; if (cb->args[0]) return 0; rcu_read_lock(); last = (struct nf_conntrack_expect *)cb->args[1]; restart: hlist_for_each_entry_rcu(exp, &help->expectations, lnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (cb->args[1]) { if (exp != last) continue; cb->args[1] = 0; } if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { if (!refcount_inc_not_zero(&exp->use)) continue; cb->args[1] = (unsigned long)exp; goto out; } } if (cb->args[1]) { cb->args[1] = 0; goto restart; } cb->args[0] = 1; out: 
rcu_read_unlock(); if (last) nf_ct_expect_put(last); return skb->len; } static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { int err; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, .done = ctnetlink_exp_done, }; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3, NULL); if (err < 0) return err; err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); /* No expectation linked to this connection tracking. */ if (!nfct_help(ct)) { nf_ct_put(ct); return 0; } c.data = ct; err = netlink_dump_start(ctnl, skb, nlh, &c); nf_ct_put(ct); return err; } static int ctnetlink_get_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; struct sk_buff *skb2; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { if (cda[CTA_EXPECT_MASTER]) return ctnetlink_dump_exp_ct(info->net, info->sk, skb, info->nlh, cda, info->extack); else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, .done = ctnetlink_exp_done, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } } err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; if (cda[CTA_EXPECT_TUPLE]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); else if (cda[CTA_EXPECT_MASTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3, NULL); else return -EINVAL; if (err < 0) return err; exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } } skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb2) { nf_ct_expect_put(exp); return -ENOMEM; } rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); nf_ct_expect_put(exp); if (err <= 0) { kfree_skb(skb2); return -ENOMEM; } return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) { struct nf_conntrack_helper *helper; const struct nf_conn_help *m_help; const char *name = data; m_help = nfct_help(exp->master); helper = rcu_dereference(m_help->helper); if (!helper) return false; return strcmp(helper->name, name) == 0; } static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) { return true; } static int ctnetlink_del_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; int err; if (cda[CTA_EXPECT_TUPLE]) { /* delete a single expect by tuple */ err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); if (err < 0) return err; /* bump usage count to 2 */ 
exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); if (ntohl(id) != (u32)(unsigned long)exp) { nf_ct_expect_put(exp); return -ENOENT; } } /* after list removal, usage count == 1 */ spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_expect_put(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); /* have to put what we 'get' above. * after this line usage count == 0 */ nf_ct_expect_put(exp); } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); nf_ct_expect_iterate_net(info->net, expect_iter_name, name, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } else { /* This basically means we have to flush everything*/ nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } return 0; } static int ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { if (cda[CTA_EXPECT_TIMEOUT]) { if (!del_timer(&x->timeout)) return -ETIME; x->timeout.expires = jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; add_timer(&x->timeout); } return 0; } #if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; #endif static int ctnetlink_parse_expect_nat(const struct nlattr *attr, struct nf_conntrack_expect *exp, u_int8_t u3) { #if IS_ENABLED(CONFIG_NF_NAT) struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; struct nf_conntrack_tuple nat_tuple = {}; int err; err = nla_parse_nested_deprecated(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy, NULL); if (err < 0) return err; if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE]) return -EINVAL; err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3, NULL); if (err < 0) return err; exp->saved_addr = nat_tuple.src.u3; exp->saved_proto = nat_tuple.src.u; exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR])); return 0; #else return -EOPNOTSUPP; #endif } static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, struct nf_conntrack_helper *helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { u_int32_t class = 0; struct nf_conntrack_expect *exp; struct nf_conn_help *help; int err; help = nfct_help(ct); if (!help) return ERR_PTR(-EOPNOTSUPP); if (cda[CTA_EXPECT_CLASS] && helper) { class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); if (class > helper->expect_class_max) return ERR_PTR(-EINVAL); } exp = nf_ct_expect_alloc(ct); if (!exp) return ERR_PTR(-ENOMEM); if (cda[CTA_EXPECT_FLAGS]) { exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); exp->flags &= ~NF_CT_EXPECT_USERSPACE; } else { exp->flags = 0; } if (cda[CTA_EXPECT_FN]) { const char *name = nla_data(cda[CTA_EXPECT_FN]); struct nf_ct_helper_expectfn *expfn; expfn = nf_ct_helper_expectfn_find_by_name(name); if (expfn == NULL) { err = -EINVAL; goto err_out; } exp->expectfn = expfn->expectfn; } else exp->expectfn = NULL; exp->class = class; exp->master = ct; exp->helper = helper; exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; if (cda[CTA_EXPECT_NAT]) { err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT], exp, nf_ct_l3num(ct)); if (err < 0) goto err_out; } return exp; err_out: 
nf_ct_expect_put(exp); return ERR_PTR(err); } static int ctnetlink_create_expect(struct net *net, const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], u_int8_t u3, u32 portid, int report) { struct nf_conntrack_tuple tuple, mask, master_tuple; struct nf_conntrack_tuple_hash *h = NULL; struct nf_conntrack_helper *helper = NULL; struct nf_conntrack_expect *exp; struct nf_conn *ct; int err; /* caller guarantees that those three CTA_EXPECT_* exist */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3, NULL); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3, NULL); if (err < 0) return err; /* Look for master conntrack of this expectation */ h = nf_conntrack_find_get(net, zone, &master_tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); if (cda[CTA_EXPECT_HELP_NAME]) { const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); helper = __nf_conntrack_helper_find(helpname, u3, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { err = -EOPNOTSUPP; goto err_ct; } rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, u3, nf_ct_protonum(ct)); if (helper) { err = -EAGAIN; goto err_rcu; } rcu_read_unlock(); #endif err = -EOPNOTSUPP; goto err_ct; } } exp = ctnetlink_alloc_expect(cda, ct, helper, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; } err = nf_ct_expect_related_report(exp, portid, report, 0); nf_ct_expect_put(exp); err_rcu: rcu_read_unlock(); err_ct: nf_ct_put(ct); return err; } static int ctnetlink_new_expect(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { u_int8_t u3 = info->nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; struct nf_conntrack_zone zone; int err; if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK] || !cda[CTA_EXPECT_MASTER]) return -EINVAL; err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3, NULL); if (err < 0) return err; spin_lock_bh(&nf_conntrack_expect_lock); exp = __nf_ct_expect_find(info->net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (info->nlh->nlmsg_flags & NLM_F_CREATE) { err = ctnetlink_create_expect(info->net, &zone, cda, u3, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); } return err; } err = -EEXIST; if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); spin_unlock_bh(&nf_conntrack_expect_lock); return err; } static int ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_EXP_GET_STATS_CPU); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, htons(cpu)); if (!nlh) goto nlmsg_failure; if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nla_put_failure: nlmsg_failure: nlmsg_cancel(skb, nlh); return -1; } static int ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) { int cpu; struct net *net = sock_net(skb->sk); if (cb->args[0] == nr_cpu_ids) return 0; for (cpu = cb->args[0]; cpu < nr_cpu_ids; cpu++) { const struct ip_conntrack_stat *st; if (!cpu_possible(cpu)) continue; st = per_cpu_ptr(net->ct.stat, cpu); if (ctnetlink_exp_stat_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cpu, st) < 0) break; } cb->args[0] = cpu; return skb->len; } static int ctnetlink_stat_exp_cpu(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_exp_stat_cpu_dump, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; } #ifdef CONFIG_NF_CONNTRACK_EVENTS static struct nf_ct_event_notifier ctnl_notifier = { .ct_event = ctnetlink_conntrack_event, .exp_event = ctnetlink_expect_event, }; #endif static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, .type = NFNL_CB_MUTEX, .attr_count = CTA_MAX, .policy = ct_nla_policy }, [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu, .type = NFNL_CB_MUTEX, }, [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct, .type = NFNL_CB_MUTEX, }, [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying, .type = NFNL_CB_MUTEX, }, [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed, .type = NFNL_CB_MUTEX, }, }; static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, .type = NFNL_CB_MUTEX, .attr_count = CTA_EXPECT_MAX, .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, .type = NFNL_CB_MUTEX, .attr_count = CTA_EXPECT_MAX, .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, .type = NFNL_CB_MUTEX, .attr_count = CTA_EXPECT_MAX, .policy = exp_nla_policy }, [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu, .type = NFNL_CB_MUTEX, }, }; static const struct nfnetlink_subsystem ctnl_subsys = { .name = "conntrack", .subsys_id = NFNL_SUBSYS_CTNETLINK, .cb_count = IPCTNL_MSG_MAX, .cb = ctnl_cb, }; static const struct nfnetlink_subsystem ctnl_exp_subsys = { .name = "conntrack_expect", .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, .cb_count = IPCTNL_MSG_EXP_MAX, .cb = ctnl_exp_cb, }; MODULE_ALIAS("ip_conntrack_netlink"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); static int __net_init 
ctnetlink_net_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS nf_conntrack_register_notifier(net, &ctnl_notifier); #endif return 0; } static void ctnetlink_net_pre_exit(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS nf_conntrack_unregister_notifier(net); #endif } static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, .pre_exit = ctnetlink_net_pre_exit, }; static int __init ctnetlink_init(void) { int ret; NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); goto err_out; } ret = nfnetlink_subsys_register(&ctnl_exp_subsys); if (ret < 0) { pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n"); goto err_unreg_subsys; } ret = register_pernet_subsys(&ctnetlink_net_ops); if (ret < 0) { pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT /* setup interaction between nf_queue and nf_conntrack_netlink. */ RCU_INIT_POINTER(nfnl_ct_hook, &ctnetlink_glue_hook); #endif return 0; err_unreg_exp_subsys: nfnetlink_subsys_unregister(&ctnl_exp_subsys); err_unreg_subsys: nfnetlink_subsys_unregister(&ctnl_subsys); err_out: return ret; } static void __exit ctnetlink_exit(void) { unregister_pernet_subsys(&ctnetlink_net_ops); nfnetlink_subsys_unregister(&ctnl_exp_subsys); nfnetlink_subsys_unregister(&ctnl_subsys); #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT RCU_INIT_POINTER(nfnl_ct_hook, NULL); #endif synchronize_rcu(); } module_init(ctnetlink_init); module_exit(ctnetlink_exit);
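/*
 * Illustrative sketch (not part of the kernel source): the status-update rule
 * enforced by ctnetlink_update_status() above can be modelled in userspace.
 * XOR-ing the old and requested status isolates the bits the caller wants to
 * change; a request is rejected only when it would *clear* IPS_SEEN_REPLY or
 * IPS_ASSURED. The DEMO_* flag values below are stand-ins chosen for this
 * example, not taken from the uapi headers.
 */
#include <stdio.h>

#define DEMO_IPS_SEEN_REPLY (1u << 1)	/* assumed bit position for the demo */
#define DEMO_IPS_ASSURED    (1u << 2)	/* assumed bit position for the demo */

static int demo_status_change_allowed(unsigned int old, unsigned int new)
{
	unsigned int d = old ^ new;	/* bits that would change */

	if ((d & DEMO_IPS_SEEN_REPLY) && !(new & DEMO_IPS_SEEN_REPLY))
		return 0;		/* SEEN_REPLY can only be set */
	if ((d & DEMO_IPS_ASSURED) && !(new & DEMO_IPS_ASSURED))
		return 0;		/* ASSURED can only be set */
	return 1;
}

int main(void)
{
	/* Setting ASSURED on top of SEEN_REPLY is allowed (prints 1)... */
	printf("%d\n", demo_status_change_allowed(DEMO_IPS_SEEN_REPLY,
						  DEMO_IPS_SEEN_REPLY |
						  DEMO_IPS_ASSURED));
	/* ...but trying to clear SEEN_REPLY is rejected (prints 0). */
	printf("%d\n", demo_status_change_allowed(DEMO_IPS_SEEN_REPLY, 0));
	return 0;
}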
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_VMX_NESTED_H #define __KVM_X86_VMX_NESTED_H #include "kvm_cache_regs.h" #include "hyperv.h" #include "vmcs12.h" #include "vmx.h" /* * Status returned by nested_vmx_enter_non_root_mode(): */ enum nvmx_vmentry_status { NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */ NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */ NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */ NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */ }; void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, u32 exit_intr_info, unsigned long exit_qualification); void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.cached_vmcs12; } static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.cached_shadow_vmcs12; } /* * Note: the same condition is checked against the state provided by userspace * in vmx_set_nested_state; if it is satisfied, the nested state must include * the VMCS12. */ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ return vmx->nested.current_vmptr != -1ull || nested_vmx_is_evmptr12_set(vmx); } static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); return vmx->nested.vpid02 ?
vmx->nested.vpid02 : vmx->vpid; } static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu) { /* return the page table to be shadowed - in our case, EPT12 */ return get_vmcs12(vcpu)->ept_pointer; } static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) { return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT; } /* * Return the cr0/4 value that a nested guest would read. This is a combination * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as * the value+mask loaded into vmcs02 may not match the vmcs12 fields. */ static inline unsigned long nested_read_cr0(struct vmcs12 *fields) { return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | (fields->cr0_read_shadow & fields->cr0_guest_host_mask); } static inline unsigned long nested_read_cr4(struct vmcs12 *fields) { return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | (fields->cr4_read_shadow & fields->cr4_guest_host_mask); } static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) { return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low); } /* * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE * to modify any valid field of the VMCS, or are the VM-exit * information fields read-only? */ static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.misc_low & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; } static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; } static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & CPU_BASED_MONITOR_TRAP_FLAG; } static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_SHADOW_VMCS; } static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; } static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) { return (vmcs12->cpu_based_vm_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && (vmcs12->secondary_vm_exec_control & bit); } static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER; } static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING; } static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; } static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12) { return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); } static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT); } static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); } static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML); } static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); } static inline bool 
nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); } static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) { return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; } static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); } static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) { return nested_cpu_has_vmfunc(vmcs12) && (vmcs12->vm_function_control & VMX_VMFUNC_EPTP_SWITCHING); } static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); } static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12) { return vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; } static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) { return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu)); } /* * In nested virtualization, check if L1 asked to exit on external interrupts. * For most existing hypervisors, this will always return true. */ static inline bool nested_exit_on_intr(struct kvm_vcpu *vcpu) { return get_vmcs12(vcpu)->pin_based_vm_exec_control & PIN_BASED_EXT_INTR_MASK; } static inline bool nested_cpu_has_encls_exit(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING); } /* * if fixed0[i] == 1: val[i] must be 1 * if fixed1[i] == 0: val[i] must be 0 */ static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) { return ((val & fixed1) | fixed0) == val; } static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_UNRESTRICTED_GUEST && nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); return fixed_bits_valid(val, fixed0, fixed1); } static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; return fixed_bits_valid(val, fixed0, fixed1); } static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; return fixed_bits_valid(val, fixed0, fixed1) && __kvm_is_valid_cr4(vcpu, val); } /* No difference in the restrictions on guest and host CR4 in VMX operation. */ #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid extern struct kvm_x86_nested_ops vmx_nested_ops; #endif /* __KVM_X86_VMX_NESTED_H */
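/*
 * Standalone sketch (not kernel code): fixed_bits_valid() above encodes the
 * rule that CR0/CR4 bits reported as "fixed to 1" must be 1 and bits reported
 * as "fixed to 0" must be 0. The sample fixed0/fixed1 values below are made up
 * for the demo; the real values come from the VMX capability MSRs
 * (e.g. nested.msrs.cr0_fixed0/cr0_fixed1).
 */
#include <stdio.h>
#include <stdint.h>

static int demo_fixed_bits_valid(uint64_t val, uint64_t fixed0, uint64_t fixed1)
{
	/* same expression as fixed_bits_valid() in nested.h above */
	return ((val & fixed1) | fixed0) == val;
}

int main(void)
{
	uint64_t fixed0 = 0x21;		/* demo: bits 0 and 5 must be 1 */
	uint64_t fixed1 = 0xffffffff;	/* demo: no bit is forced to 0 */

	printf("%d\n", demo_fixed_bits_valid(0x21, fixed0, fixed1)); /* 1: ok */
	printf("%d\n", demo_fixed_bits_valid(0x20, fixed0, fixed1)); /* 0: bit 0 cleared */
	return 0;
}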
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/truncate.h * * Common inline functions needed for truncate support */ /* * Truncate blocks that were not used by write. We have to truncate the * pagecache as well so that corresponding buffers get properly unmapped. */ static inline void ext4_truncate_failed_write(struct inode *inode) { struct address_space *mapping = inode->i_mapping; /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ filemap_invalidate_lock(mapping); truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); filemap_invalidate_unlock(mapping); } /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) { ext4_lblk_t needed; needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); /* Give ourselves just enough room to cope with inodes in which * i_blocks is corrupt: we've seen disk corruptions in the past * which resulted in random data in an inode which looked enough * like a regular file for ext4 to try to delete it. Things * will go a bit crazy if that happens, but at least we should * try not to panic the whole kernel. */ if (needed < 2) needed = 2; /* But we need to bound the transaction so we don't overflow the * journal. */ if (needed > EXT4_MAX_TRANS_DATA) needed = EXT4_MAX_TRANS_DATA; return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; }
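/*
 * Worked example (userspace sketch, not ext4 code): ext4_blocks_for_truncate()
 * above converts i_blocks (512-byte sectors) into filesystem blocks by shifting
 * by (s_blocksize_bits - 9), then clamps the result to [2, EXT4_MAX_TRANS_DATA].
 * DEMO_MAX_TRANS_DATA and DEMO_DATA_TRANS_BLOCKS are placeholder values, not
 * the real macro expansions.
 */
#include <stdio.h>

#define DEMO_MAX_TRANS_DATA	64	/* stand-in for EXT4_MAX_TRANS_DATA */
#define DEMO_DATA_TRANS_BLOCKS	8	/* stand-in for EXT4_DATA_TRANS_BLOCKS() */

static unsigned long demo_blocks_for_truncate(unsigned long i_blocks,
					      unsigned int blocksize_bits)
{
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < 2)
		needed = 2;			/* guard against corrupt i_blocks */
	if (needed > DEMO_MAX_TRANS_DATA)
		needed = DEMO_MAX_TRANS_DATA;	/* bound the transaction size */
	return DEMO_DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
	/* 4 KiB blocks (blocksize_bits = 12): 80 sectors -> 10 blocks -> 18 credits */
	printf("%lu\n", demo_blocks_for_truncate(80, 12));
	return 0;
}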
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Facebook */ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static
int queue_poll_stat_show(void *data, struct seq_file *m) { return 0; } static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) __acquires(&q->requeue_lock) { struct request_queue *q = m->private; spin_lock_irq(&q->requeue_lock); return seq_list_start(&q->requeue_list, *pos); } static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) { struct request_queue *q = m->private; return seq_list_next(v, &q->requeue_list, pos); } static void queue_requeue_list_stop(struct seq_file *m, void *v) __releases(&q->requeue_lock) { struct request_queue *q = m->private; spin_unlock_irq(&q->requeue_lock); } static const struct seq_operations queue_requeue_list_seq_ops = { .start = queue_requeue_list_start, .next = queue_requeue_list_next, .stop = queue_requeue_list_stop, .show = blk_mq_debugfs_rq_show, }; static int blk_flags_show(struct seq_file *m, const unsigned long flags, const char *const *flag_name, int flag_name_count) { bool sep = false; int i; for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) { if (!(flags & BIT(i))) continue; if (sep) seq_puts(m, "|"); sep = true; if (i < flag_name_count && flag_name[i]) seq_puts(m, flag_name[i]); else seq_printf(m, "%d", i); } return 0; } static int queue_pm_only_show(void *data, struct seq_file *m) { struct request_queue *q = data; seq_printf(m, "%d\n", atomic_read(&q->pm_only)); return 0; } #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(STOPPED), QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(STABLE_WRITES), QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(NOWAIT), QUEUE_FLAG_NAME(SQ_SCHED), QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), }; #undef QUEUE_FLAG_NAME static int queue_state_show(void *data, struct seq_file *m) { struct request_queue *q = data; blk_flags_show(m, q->queue_flags, blk_queue_flag_name, ARRAY_SIZE(blk_queue_flag_name)); seq_puts(m, "\n"); return 0; } static ssize_t queue_state_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct request_queue *q = data; char opbuf[16] = { }, *op; /* * The "state" attribute is removed when the queue is removed. Don't * allow setting the state on a dying queue to avoid a use-after-free. 
*/ if (blk_queue_dying(q)) return -ENOENT; if (count >= sizeof(opbuf)) { pr_err("%s: operation too long\n", __func__); goto inval; } if (copy_from_user(opbuf, buf, count)) return -EFAULT; op = strstrip(opbuf); if (strcmp(op, "run") == 0) { blk_mq_run_hw_queues(q, true); } else if (strcmp(op, "start") == 0) { blk_mq_start_stopped_hw_queues(q, true); } else if (strcmp(op, "kick") == 0) { blk_mq_kick_requeue_list(q); } else { pr_err("%s: unsupported operation '%s'\n", __func__, op); inval: pr_err("%s: use 'run', 'start' or 'kick'\n", __func__); return -EINVAL; } return count; } static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, { }, }; #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), HCTX_STATE_NAME(INACTIVE), }; #undef HCTX_STATE_NAME static int hctx_state_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; blk_flags_show(m, hctx->state, hctx_state_name, ARRAY_SIZE(hctx_state_name)); seq_puts(m, "\n"); return 0; } #define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name static const char *const alloc_policy_name[] = { BLK_TAG_ALLOC_NAME(FIFO), BLK_TAG_ALLOC_NAME(RR), }; #undef BLK_TAG_ALLOC_NAME #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), }; #undef HCTX_FLAG_NAME static int hctx_flags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags); seq_puts(m, "alloc_policy="); if (alloc_policy < ARRAY_SIZE(alloc_policy_name) && alloc_policy_name[alloc_policy]) seq_puts(m, alloc_policy_name[alloc_policy]); else seq_printf(m, "%d", alloc_policy); seq_puts(m, " "); blk_flags_show(m, hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy), hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); seq_puts(m, "\n"); return 0; } #define CMD_FLAG_NAME(name) [__REQ_##name] = #name static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(FAILFAST_DEV), CMD_FLAG_NAME(FAILFAST_TRANSPORT), CMD_FLAG_NAME(FAILFAST_DRIVER), CMD_FLAG_NAME(SYNC), CMD_FLAG_NAME(META), CMD_FLAG_NAME(PRIO), CMD_FLAG_NAME(NOMERGE), CMD_FLAG_NAME(IDLE), CMD_FLAG_NAME(INTEGRITY), CMD_FLAG_NAME(FUA), CMD_FLAG_NAME(PREFLUSH), CMD_FLAG_NAME(RAHEAD), CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOUNMAP), CMD_FLAG_NAME(POLLED), }; #undef CMD_FLAG_NAME #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { RQF_NAME(STARTED), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), RQF_NAME(DONTPREP), RQF_NAME(SCHED_TAGS), RQF_NAME(USE_SCHED), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(IO_STAT), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(TIMED_OUT), RQF_NAME(RESV), }; #undef RQF_NAME static const char *const blk_mq_rq_state_name_array[] = { [MQ_RQ_IDLE] = "idle", [MQ_RQ_IN_FLIGHT] = "in_flight", [MQ_RQ_COMPLETE] = "complete", }; static const char 
*blk_mq_rq_state_name(enum mq_rq_state rq_state) { if (WARN_ON_ONCE((unsigned int)rq_state >= ARRAY_SIZE(blk_mq_rq_state_name_array))) return "(?)"; return blk_mq_rq_state_name_array[rq_state]; } int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const enum req_op op = req_op(rq); const char *op_str = blk_op_str(op); seq_printf(m, "%p {.op=", rq); if (strcmp(op_str, "UNKNOWN") == 0) seq_printf(m, "%u", op); else seq_printf(m, "%s", op_str); seq_puts(m, ", .cmd_flags="); blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK), cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); return 0; } EXPORT_SYMBOL_GPL(__blk_mq_debugfs_rq_show); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) { return __blk_mq_debugfs_rq_show(m, list_entry_rq(v)); } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_lock(&hctx->lock); return seq_list_start(&hctx->dispatch, *pos); } static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) { struct blk_mq_hw_ctx *hctx = m->private; return seq_list_next(v, &hctx->dispatch, pos); } static void hctx_dispatch_stop(struct seq_file *m, void *v) __releases(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_unlock(&hctx->lock); } static const struct seq_operations hctx_dispatch_seq_ops = { .start = hctx_dispatch_start, .next = hctx_dispatch_next, .stop = hctx_dispatch_stop, .show = blk_mq_debugfs_rq_show, }; struct show_busy_params { struct seq_file *m; struct blk_mq_hw_ctx *hctx; }; /* * Note: the state of a request may change while this function is in progress, * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to * keep iterating requests. 
*/ static bool hctx_show_busy_rq(struct request *rq, void *data) { const struct show_busy_params *params = data; if (rq->mq_hctx == params->hctx) __blk_mq_debugfs_rq_show(params->m, rq); return true; } static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, &params); return 0; } static const char *const hctx_types[] = { [HCTX_TYPE_DEFAULT] = "default", [HCTX_TYPE_READ] = "read", [HCTX_TYPE_POLL] = "poll", }; static int hctx_type_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES); seq_printf(m, "%s\n", hctx_types[hctx->type]); return 0; } static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; sbitmap_bitmap_show(&hctx->ctx_map, m); return 0; } static void blk_mq_debugfs_tags_show(struct seq_file *m, struct blk_mq_tags *tags) { seq_printf(m, "nr_tags=%u\n", tags->nr_tags); seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", READ_ONCE(tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(&tags->bitmap_tags, m); if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); sbitmap_queue_show(&tags->breserved_tags, m); } } static int hctx_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_sched_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%d\n", __blk_mq_active_requests(hctx)); return 0; } static int hctx_dispatch_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%u\n", hctx->dispatch_busy); return 0; } #define CTX_RQ_SEQ_OPS(name, type) \ static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ __acquires(&ctx->lock) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ spin_lock(&ctx->lock); \ return seq_list_start(&ctx->rq_lists[type], *pos); \ } \ \ static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ return seq_list_next(v, &ctx->rq_lists[type], pos); \ } \ \ static void ctx_##name##_rq_list_stop(struct 
seq_file *m, void *v) \ __releases(&ctx->lock) \ { \ struct blk_mq_ctx *ctx = m->private; \ \ spin_unlock(&ctx->lock); \ } \ \ static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ .start = ctx_##name##_rq_list_start, \ .next = ctx_##name##_rq_list_next, \ .stop = ctx_##name##_rq_list_stop, \ .show = blk_mq_debugfs_rq_show, \ } CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; void *data = d_inode(m->file->f_path.dentry->d_parent)->i_private; return attr->show(data, m); } static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *m = file->private_data; const struct blk_mq_debugfs_attr *attr = m->private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private; /* * Attributes that only implement .seq_ops are read-only and 'attr' is * the same with 'data' in this case. */ if (attr == data || !attr->write) return -EPERM; return attr->write(data, buf, count, ppos); } static int blk_mq_debugfs_open(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private; struct seq_file *m; int ret; if (attr->seq_ops) { ret = seq_open(file, attr->seq_ops); if (!ret) { m = file->private_data; m->private = data; } return ret; } if (WARN_ON_ONCE(!attr->show)) return -EPERM; return single_open(file, blk_mq_debugfs_show, inode->i_private); } static int blk_mq_debugfs_release(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; if (attr->show) return single_release(inode, file); return seq_release(inode, file); } static const struct file_operations blk_mq_debugfs_fops = { .open = blk_mq_debugfs_open, .read = seq_read, .write = blk_mq_debugfs_write, .llseek = seq_lseek, .release = blk_mq_debugfs_release, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops}, {"busy", 0400, hctx_busy_show}, {"ctx_map", 0400, hctx_ctx_map_show}, {"tags", 0400, hctx_tags_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, {}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {}, }; static void debugfs_create_files(struct dentry *parent, void *data, const struct blk_mq_debugfs_attr *attr) { if (IS_ERR_OR_NULL(parent)) return; d_inode(parent)->i_private = data; for (; attr->name; attr++) debugfs_create_file(attr->name, attr->mode, parent, (void *)attr, &blk_mq_debugfs_fops); } void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); /* * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir * didn't exist yet (because we don't know what to name the directory * until the queue is registered to a gendisk). 
*/ if (q->elevator && !q->sched_debugfs_dir) blk_mq_debugfs_register_sched(q); /* Similarly, blk_mq_init_hctx() couldn't do this previously. */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) blk_mq_debugfs_register_hctx(q, hctx); if (q->elevator && !hctx->sched_debugfs_dir) blk_mq_debugfs_register_sched_hctx(q, hctx); } if (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; while (rqos) { blk_mq_debugfs_register_rqos(rqos); rqos = rqos->next; } } } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { struct dentry *ctx_dir; char name[20]; snprintf(name, sizeof(name), "cpu%u", ctx->cpu); ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs); } void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; char name[20]; int i; if (!q->debugfs_dir) return; snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs); hctx_for_each_ctx(hctx, ctx, i) blk_mq_debugfs_register_ctx(hctx, ctx); } void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { if (!hctx->queue->debugfs_dir) return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; } void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); } void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; lockdep_assert_held(&q->debugfs_mutex); /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. 
*/ if (!q->debugfs_dir) return; if (!e->queue_debugfs_attrs) return; q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs); } void blk_mq_debugfs_unregister_sched(struct request_queue *q) { lockdep_assert_held(&q->debugfs_mutex); debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } static const char *rq_qos_id_to_name(enum rq_qos_id id) { switch (id) { case RQ_QOS_WBT: return "wbt"; case RQ_QOS_LATENCY: return "latency"; case RQ_QOS_COST: return "cost"; } return "unknown"; } void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { lockdep_assert_held(&rqos->disk->queue->debugfs_mutex); if (!rqos->disk->queue->debugfs_dir) return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; } void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); lockdep_assert_held(&q->debugfs_mutex); if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; if (!q->rqos_debugfs_dir) q->rqos_debugfs_dir = debugfs_create_dir("rqos", q->debugfs_dir); rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct elevator_type *e = q->elevator->type; lockdep_assert_held(&q->debugfs_mutex); /* * If the parent debugfs directory has not been created yet, return; * We will be called again later on with appropriate parent debugfs * directory from blk_register_queue() */ if (!hctx->debugfs_dir) return; if (!e->hctx_debugfs_attrs) return; hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx->debugfs_dir); debugfs_create_files(hctx->sched_debugfs_dir, hctx, e->hctx_debugfs_attrs); } void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&hctx->queue->debugfs_mutex); if (!hctx->queue->debugfs_dir) return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; }
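/*
 * Userspace sketch (assumptions noted below): the flag-decoding pattern used
 * by blk_flags_show() above walks every bit of a flags word and prints either
 * the symbolic name, when the lookup table has one, or the raw bit index,
 * separated by '|'. The demo_flag_name[] table is invented for this example;
 * the real tables are blk_queue_flag_name[], hctx_flag_name[], rqf_name[], etc.
 */
#include <stdio.h>
#include <limits.h>

static void demo_flags_show(unsigned long flags,
			    const char *const *flag_name, int flag_name_count)
{
	int sep = 0;

	for (int i = 0; i < (int)(sizeof(flags) * CHAR_BIT); i++) {
		if (!(flags & (1UL << i)))
			continue;
		if (sep)
			printf("|");
		sep = 1;
		if (i < flag_name_count && flag_name[i])
			printf("%s", flag_name[i]);
		else
			printf("%d", i);
	}
	printf("\n");
}

int main(void)
{
	static const char *const demo_flag_name[] = { "STOPPED", "DYING", "NOMERGES" };

	/* bits 0 and 2 are named, bit 5 is not -> prints "STOPPED|NOMERGES|5" */
	demo_flags_show((1UL << 0) | (1UL << 2) | (1UL << 5), demo_flag_name, 3);
	return 0;
}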
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" struct devlink_region { struct devlink *devlink; struct devlink_port *port; struct list_head list; union { const struct devlink_region_ops *ops; const struct devlink_port_region_ops *port_ops; }; struct mutex snapshot_lock; /* protects snapshot_list, * max_snapshots and cur_snapshots * consistency.
*/ struct list_head snapshot_list; u32 max_snapshots; u32 cur_snapshots; u64 size; }; struct devlink_snapshot { struct list_head list; struct devlink_region *region; u8 *data; u32 id; }; static struct devlink_region * devlink_region_get_by_name(struct devlink *devlink, const char *region_name) { struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) if (!strcmp(region->ops->name, region_name)) return region; return NULL; } static struct devlink_region * devlink_port_region_get_by_name(struct devlink_port *port, const char *region_name) { struct devlink_region *region; list_for_each_entry(region, &port->region_list, list) if (!strcmp(region->ops->name, region_name)) return region; return NULL; } static struct devlink_snapshot * devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) { struct devlink_snapshot *snapshot; list_for_each_entry(snapshot, &region->snapshot_list, list) if (snapshot->id == id) return snapshot; return NULL; } static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_snapshot *snapshot) { struct nlattr *snap_attr; int err; snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT); if (!snap_attr) return -EINVAL; err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); if (err) goto nla_put_failure; nla_nest_end(msg, snap_attr); return 0; nla_put_failure: nla_nest_cancel(msg, snap_attr); return err; } static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_region *region) { struct devlink_snapshot *snapshot; struct nlattr *snapshots_attr; int err; snapshots_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); if (!snapshots_attr) return -EINVAL; list_for_each_entry(snapshot, &region->snapshot_list, list) { err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); if (err) goto nla_put_failure; } nla_nest_end(msg, snapshots_attr); return 0; nla_put_failure: nla_nest_cancel(msg, snapshots_attr); return err; } static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct devlink_region *region) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; err = devlink_nl_put_handle(msg, devlink); if (err) goto nla_put_failure; if (region->port) { err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto nla_put_failure; } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto nla_put_failure; err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, region->size, DEVLINK_ATTR_PAD); if (err) goto nla_put_failure; err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, region->max_snapshots); if (err) goto nla_put_failure; err = devlink_nl_region_snapshots_id_put(msg, devlink, region); if (err) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return err; } static struct sk_buff * devlink_nl_region_notify_build(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); if (!hdr) { err = -EMSGSIZE; goto out_free_msg; } err = devlink_nl_put_handle(msg, devlink); 
if (err) goto out_cancel_msg; if (region->port) { err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto out_cancel_msg; } err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto out_cancel_msg; if (snapshot) { err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); if (err) goto out_cancel_msg; } else { err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, region->size, DEVLINK_ATTR_PAD); if (err) goto out_cancel_msg; } genlmsg_end(msg, hdr); return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); return ERR_PTR(err); } static void devlink_nl_region_notify(struct devlink_region *region, struct devlink_snapshot *snapshot, enum devlink_command cmd) { struct devlink *devlink = region->devlink; struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); if (IS_ERR(msg)) return; devlink_nl_notify_send(devlink, msg); } void devlink_regions_notify_register(struct devlink *devlink) { struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); } void devlink_regions_notify_unregister(struct devlink *devlink) { struct devlink_region *region; list_for_each_entry_reverse(region, &devlink->region_list, list) devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); } /** * __devlink_snapshot_id_increment - Increment number of snapshots using an id * @devlink: devlink instance * @id: the snapshot id * * Track when a new snapshot begins using an id. Load the count for the * given id from the snapshot xarray, increment it, and store it back. * * Called when a new snapshot is created with the given id. * * The id *must* have been previously allocated by * devlink_region_snapshot_id_get(). * * Returns 0 on success, or an error on failure. */ static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) { unsigned long count; void *p; int err; xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); if (WARN_ON(!p)) { err = -EINVAL; goto unlock; } if (WARN_ON(!xa_is_value(p))) { err = -EINVAL; goto unlock; } count = xa_to_value(p); count++; err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), GFP_ATOMIC)); unlock: xa_unlock(&devlink->snapshot_ids); return err; } /** * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id * @devlink: devlink instance * @id: the snapshot id * * Track when a snapshot is deleted and stops using an id. Load the count * for the given id from the snapshot xarray, decrement it, and store it * back. * * If the count reaches zero, erase this id from the xarray, freeing it * up for future re-use by devlink_region_snapshot_id_get(). * * Called when a snapshot using the given id is deleted, and when the * initial allocator of the id is finished using it. 
*/ static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) { unsigned long count; void *p; xa_lock(&devlink->snapshot_ids); p = xa_load(&devlink->snapshot_ids, id); if (WARN_ON(!p)) goto unlock; if (WARN_ON(!xa_is_value(p))) goto unlock; count = xa_to_value(p); if (count > 1) { count--; __xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), GFP_ATOMIC); } else { /* If this was the last user, we can erase this id */ __xa_erase(&devlink->snapshot_ids, id); } unlock: xa_unlock(&devlink->snapshot_ids); } /** * __devlink_snapshot_id_insert - Insert a specific snapshot ID * @devlink: devlink instance * @id: the snapshot id * * Mark the given snapshot id as used by inserting a zero value into the * snapshot xarray. * * This must be called while holding the devlink instance lock. Unlike * devlink_snapshot_id_get, the initial reference count is zero, not one. * It is expected that the id will immediately be used before * releasing the devlink instance lock. * * Returns zero on success, or an error code if the snapshot id could not * be inserted. */ static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) { int err; xa_lock(&devlink->snapshot_ids); if (xa_load(&devlink->snapshot_ids, id)) { xa_unlock(&devlink->snapshot_ids); return -EEXIST; } err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), GFP_ATOMIC)); xa_unlock(&devlink->snapshot_ids); return err; } /** * __devlink_region_snapshot_id_get - get snapshot ID * @devlink: devlink instance * @id: storage to return snapshot id * * Allocates a new snapshot id. Returns zero on success, or a negative * error on failure. Must be called while holding the devlink instance * lock. * * Snapshot IDs are tracked using an xarray which stores the number of * users of the snapshot id. * * Note that the caller of this function counts as a 'user', in order to * avoid race conditions. The caller must release its hold on the * snapshot by using devlink_region_snapshot_id_put. */ static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), xa_limit_32b, GFP_KERNEL); } /** * __devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * * Must be called only while holding the region snapshot lock. 
* * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created */ static int __devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id) { struct devlink *devlink = region->devlink; struct devlink_snapshot *snapshot; int err; lockdep_assert_held(&region->snapshot_lock); /* check if region can hold one more snapshot */ if (region->cur_snapshots == region->max_snapshots) return -ENOSPC; if (devlink_region_snapshot_get_by_id(region, snapshot_id)) return -EEXIST; snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); if (!snapshot) return -ENOMEM; err = __devlink_snapshot_id_increment(devlink, snapshot_id); if (err) goto err_snapshot_id_increment; snapshot->id = snapshot_id; snapshot->region = region; snapshot->data = data; list_add_tail(&snapshot->list, &region->snapshot_list); region->cur_snapshots++; devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); return 0; err_snapshot_id_increment: kfree(snapshot); return err; } static void devlink_region_snapshot_del(struct devlink_region *region, struct devlink_snapshot *snapshot) { struct devlink *devlink = region->devlink; lockdep_assert_held(&region->snapshot_lock); devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); region->cur_snapshots--; list_del(&snapshot->list); region->ops->destructor(snapshot->data); __devlink_snapshot_id_decrement(devlink, snapshot->id); kfree(snapshot); } int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; struct sk_buff *msg; unsigned int index; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) return -EINVAL; if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, info->snd_portid, info->snd_seq, 0, region); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg, struct netlink_callback *cb, struct devlink_port *port, int *idx, int start, int flags) { struct devlink_region *region; int err = 0; list_for_each_entry(region, &port->region_list, list) { if (*idx < start) { (*idx)++; continue; } err = devlink_nl_region_fill(msg, port->devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, region); if (err) goto out; (*idx)++; } out: return err; } static int devlink_nl_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_region *region; struct devlink_port *port; unsigned long port_index; int idx = 0; int err; list_for_each_entry(region, &devlink->region_list, list) { if (idx < state->idx) { idx++; continue; } err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags, region); if (err) { state->idx = idx; return err; } idx++; } 
xa_for_each(&devlink->ports, port_index, port) { err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx, state->idx, flags); if (err) { state->idx = idx; return err; } } return 0; } int devlink_nl_region_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one); } int devlink_nl_region_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; struct devlink_port *port = NULL; struct devlink_region *region; const char *region_name; unsigned int index; u32 snapshot_id; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) return -EINVAL; region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) return -EINVAL; mutex_lock(&region->snapshot_lock); snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (!snapshot) { mutex_unlock(&region->snapshot_lock); return -EINVAL; } devlink_region_snapshot_del(region, snapshot); mutex_unlock(&region->snapshot_lock); return 0; } int devlink_nl_region_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_snapshot *snapshot; struct devlink_port *port = NULL; struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; unsigned int index; u32 snapshot_id; u8 *data; int err; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { NL_SET_ERR_MSG(info->extack, "No region name provided"); return -EINVAL; } region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) return -ENODEV; } if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) { NL_SET_ERR_MSG(info->extack, "The requested region does not exist"); return -EINVAL; } if (!region->ops->snapshot) { NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot"); return -EOPNOTSUPP; } mutex_lock(&region->snapshot_lock); if (region->cur_snapshots == region->max_snapshots) { NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots"); err = -ENOSPC; goto unlock; } snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; if (snapshot_id_attr) { snapshot_id = nla_get_u32(snapshot_id_attr); if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use"); err = -EEXIST; goto unlock; } err = __devlink_snapshot_id_insert(devlink, snapshot_id); if (err) goto unlock; } else { err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); if (err) { NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id"); goto unlock; } } if (port) err = region->port_ops->snapshot(port, region->port_ops, info->extack, &data); else err = region->ops->snapshot(devlink, region->ops, info->extack, 
&data); if (err) goto err_snapshot_capture; err = __devlink_region_snapshot_create(region, data, snapshot_id); if (err) goto err_snapshot_create; if (!snapshot_id_attr) { struct sk_buff *msg; snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (WARN_ON(!snapshot)) { err = -EINVAL; goto unlock; } msg = devlink_nl_region_notify_build(region, snapshot, DEVLINK_CMD_REGION_NEW, info->snd_portid, info->snd_seq); err = PTR_ERR_OR_ZERO(msg); if (err) goto err_notify; err = genlmsg_reply(msg, info); if (err) goto err_notify; } mutex_unlock(&region->snapshot_lock); return 0; err_snapshot_create: region->ops->destructor(data); err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); mutex_unlock(&region->snapshot_lock); return err; err_notify: devlink_region_snapshot_del(region, snapshot); unlock: mutex_unlock(&region->snapshot_lock); return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, u8 *chunk, u32 chunk_size, u64 addr) { struct nlattr *chunk_attr; int err; chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK); if (!chunk_attr) return -EINVAL; err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk); if (err) goto nla_put_failure; err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr, DEVLINK_ATTR_PAD); if (err) goto nla_put_failure; nla_nest_end(msg, chunk_attr); return 0; nla_put_failure: nla_nest_cancel(msg, chunk_attr); return err; } #define DEVLINK_REGION_READ_CHUNK_SIZE 256 typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack); static int devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb, void *cb_priv, u64 start_offset, u64 end_offset, u64 *new_offset, struct netlink_ext_ack *extack) { u64 curr_offset = start_offset; int err = 0; u8 *data; /* Allocate and re-use a single buffer */ data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL); if (!data) return -ENOMEM; *new_offset = start_offset; while (curr_offset < end_offset) { u32 data_size; data_size = min_t(u32, end_offset - curr_offset, DEVLINK_REGION_READ_CHUNK_SIZE); err = cb(cb_priv, data, data_size, curr_offset, extack); if (err) break; err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); if (err) break; curr_offset += data_size; } *new_offset = curr_offset; kfree(data); return err; } static int devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack __always_unused *extack) { struct devlink_snapshot *snapshot = cb_priv; memcpy(chunk, &snapshot->data[curr_offset], chunk_size); return 0; } static int devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack) { struct devlink_region *region = cb_priv; return region->port_ops->read(region->port, region->port_ops, extack, curr_offset, chunk_size, chunk); } static int devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, u64 curr_offset, struct netlink_ext_ack *extack) { struct devlink_region *region = cb_priv; return region->ops->read(region->devlink, region->ops, extack, curr_offset, chunk_size, chunk); } int devlink_nl_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr 
**attrs = info->info.attrs; struct devlink_port *port = NULL; devlink_chunk_fill_t *region_cb; struct devlink_region *region; const char *region_name; struct devlink *devlink; unsigned int index; void *region_cb_priv; void *hdr; int err; start_offset = state->start_offset; devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, false); if (IS_ERR(devlink)) return PTR_ERR(devlink); if (!attrs[DEVLINK_ATTR_REGION_NAME]) { NL_SET_ERR_MSG(cb->extack, "No region name provided"); err = -EINVAL; goto out_unlock; } if (attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]); port = devlink_port_get_by_index(devlink, index); if (!port) { err = -ENODEV; goto out_unlock; } } region_attr = attrs[DEVLINK_ATTR_REGION_NAME]; region_name = nla_data(region_attr); if (port) region = devlink_port_region_get_by_name(port, region_name); else region = devlink_region_get_by_name(devlink, region_name); if (!region) { NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist"); err = -EINVAL; goto out_unlock; } snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; if (!snapshot_attr) { if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); err = -EINVAL; goto out_unlock; } if (!region->ops->read) { NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); err = -EOPNOTSUPP; goto out_unlock; } if (port) region_cb = &devlink_region_port_direct_fill; else region_cb = &devlink_region_direct_fill; region_cb_priv = region; } else { struct devlink_snapshot *snapshot; u32 snapshot_id; if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); err = -EINVAL; goto out_unlock; } snapshot_id = nla_get_u32(snapshot_attr); snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); if (!snapshot) { NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); err = -EINVAL; goto out_unlock; } region_cb = &devlink_region_snapshot_fill; region_cb_priv = snapshot; } if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { if (!start_offset) start_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); } if (end_offset > region->size) end_offset = region->size; /* return 0 if there is no further data to read */ if (start_offset == end_offset) { err = 0; goto out_unlock; } hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, DEVLINK_CMD_REGION_READ); if (!hdr) { err = -EMSGSIZE; goto out_unlock; } err = devlink_nl_put_handle(skb, devlink); if (err) goto nla_put_failure; if (region->port) { err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX, region->port->index); if (err) goto nla_put_failure; } err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name); if (err) goto nla_put_failure; chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS); if (!chunks_attr) { err = -EMSGSIZE; goto nla_put_failure; } err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, start_offset, end_offset, &ret_offset, cb->extack); if (err && err != -EMSGSIZE) goto nla_put_failure; /* Check if there was any progress done to prevent infinite loop */ if (ret_offset == start_offset) { err = -EINVAL; goto nla_put_failure; } state->start_offset = ret_offset; 
nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); devl_unlock(devlink); devlink_put(devlink); return skb->len; nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: devl_unlock(devlink); devlink_put(devlink); return err; } /** * devl_region_create - create a new address region * * @devlink: devlink * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region */ struct devlink_region *devl_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; devl_assert_locked(devlink); if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); if (devlink_region_get_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); region = kzalloc(sizeof(*region), GFP_KERNEL); if (!region) return ERR_PTR(-ENOMEM); region->devlink = devlink; region->max_snapshots = region_max_snapshots; region->ops = ops; region->size = region_size; INIT_LIST_HEAD(&region->snapshot_list); mutex_init(&region->snapshot_lock); list_add_tail(&region->list, &devlink->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); return region; } EXPORT_SYMBOL_GPL(devl_region_create); /** * devlink_region_create - create a new address region * * @devlink: devlink * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region * * Context: Takes and release devlink->lock <mutex>. */ struct devlink_region * devlink_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; devl_lock(devlink); region = devl_region_create(devlink, ops, region_max_snapshots, region_size); devl_unlock(devlink); return region; } EXPORT_SYMBOL_GPL(devlink_region_create); /** * devlink_port_region_create - create a new address region for a port * * @port: devlink port * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region * * Context: Takes and release devlink->lock <mutex>. 
*/ struct devlink_region * devlink_port_region_create(struct devlink_port *port, const struct devlink_port_region_ops *ops, u32 region_max_snapshots, u64 region_size) { struct devlink *devlink = port->devlink; struct devlink_region *region; int err = 0; ASSERT_DEVLINK_PORT_INITIALIZED(port); if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); devl_lock(devlink); if (devlink_port_region_get_by_name(port, ops->name)) { err = -EEXIST; goto unlock; } region = kzalloc(sizeof(*region), GFP_KERNEL); if (!region) { err = -ENOMEM; goto unlock; } region->devlink = devlink; region->port = port; region->max_snapshots = region_max_snapshots; region->port_ops = ops; region->size = region_size; INIT_LIST_HEAD(&region->snapshot_list); mutex_init(&region->snapshot_lock); list_add_tail(&region->list, &port->region_list); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); devl_unlock(devlink); return region; unlock: devl_unlock(devlink); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(devlink_port_region_create); /** * devl_region_destroy - destroy address region * * @region: devlink region to destroy */ void devl_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; struct devlink_snapshot *snapshot, *ts; devl_assert_locked(devlink); /* Free all snapshots of region */ mutex_lock(&region->snapshot_lock); list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list) devlink_region_snapshot_del(region, snapshot); mutex_unlock(&region->snapshot_lock); list_del(&region->list); mutex_destroy(&region->snapshot_lock); devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); kfree(region); } EXPORT_SYMBOL_GPL(devl_region_destroy); /** * devlink_region_destroy - destroy address region * * @region: devlink region to destroy * * Context: Takes and release devlink->lock <mutex>. */ void devlink_region_destroy(struct devlink_region *region) { struct devlink *devlink = region->devlink; devl_lock(devlink); devl_region_destroy(region); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_region_destroy); /** * devlink_region_snapshot_id_get - get snapshot ID * * This callback should be called when adding a new snapshot, * Driver should use the same id for multiple snapshots taken * on multiple regions at the same time/by the same trigger. * * The caller of this function must use devlink_region_snapshot_id_put * when finished creating regions using this id. * * Returns zero on success, or a negative error code on failure. * * @devlink: devlink * @id: storage to return id */ int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { return __devlink_region_snapshot_id_get(devlink, id); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); /** * devlink_region_snapshot_id_put - put snapshot ID reference * * This should be called by a driver after finishing creating snapshots * with an id. Doing so ensures that the ID can later be released in the * event that all snapshots using it have been destroyed. * * @devlink: devlink * @id: id to release reference on */ void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) { __devlink_snapshot_id_decrement(devlink, id); } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); /** * devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. 
* The @snapshot_id should be obtained using the getter function. * * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created */ int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id) { int err; mutex_lock(&region->snapshot_lock); err = __devlink_region_snapshot_create(region, data, snapshot_id); mutex_unlock(&region->snapshot_lock); return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
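/*
 * Illustrative sketch, not part of net/devlink/region.c above: the driver-side
 * pattern the snapshot id and snapshot creation helpers are meant for. The
 * region name, size handling and capture helper are hypothetical; the devlink
 * calls and the ops contract (a mandatory .destructor) come from the code
 * above.
 */
#include <linux/slab.h>
#include <net/devlink.h>

static const struct devlink_region_ops example_crash_region_ops = {
	.name	    = "crash-dump",
	.destructor = kfree,	/* frees snapshot data handed to devlink below */
};

/* The region itself is typically created once at probe time:
 *	region = devl_region_create(devlink, &example_crash_region_ops,
 *				    8, EXAMPLE_CRASH_SIZE);
 */
static int example_take_crash_snapshot(struct devlink *devlink,
				       struct devlink_region *region,
				       u64 size)
{
	u32 snapshot_id;
	u8 *data;
	int err;

	/* One id may be shared by snapshots of several regions taken together. */
	err = devlink_region_snapshot_id_get(devlink, &snapshot_id);
	if (err)
		return err;

	data = kzalloc(size, GFP_KERNEL);	/* a real driver fills this from HW */
	if (!data) {
		err = -ENOMEM;
		goto out_put_id;
	}

	/* On success devlink owns @data and frees it via ops->destructor. */
	err = devlink_region_snapshot_create(region, data, snapshot_id);
	if (err)
		kfree(data);

out_put_id:
	/* Drop the allocator's reference; stored snapshots keep their own. */
	devlink_region_snapshot_id_put(devlink, snapshot_id);
	return err;
}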
/* SPDX-License-Identifier: GPL-2.0-only */ /* * pm_runtime.h - Device run-time power management helper functions. * * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl> */ #ifndef _LINUX_PM_RUNTIME_H #define _LINUX_PM_RUNTIME_H #include <linux/device.h> #include <linux/notifier.h> #include <linux/pm.h> #include <linux/jiffies.h> /* Runtime PM flag argument bits */ #define RPM_ASYNC 0x01 /* Request is asynchronous */ #define RPM_NOWAIT 0x02 /* Don't wait for concurrent state change */ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ /* * Use this for defining a set of PM operations to be used in all situations * (system suspend, hibernation or runtime PM). * * Note that the behaviour differs from the deprecated UNIVERSAL_DEV_PM_OPS() * macro, which uses the provided callbacks for both runtime PM and system * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend() * and pm_runtime_force_resume() for its system sleep callbacks. * * If the underlying dev_pm_ops struct symbol has to be exported, use * EXPORT_RUNTIME_DEV_PM_OPS() or EXPORT_GPL_RUNTIME_DEV_PM_OPS() instead.
*/ #define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \ pm_runtime_force_resume, suspend_fn, \ resume_fn, idle_fn) #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_GPL_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; static inline bool queue_pm_work(struct work_struct *work) { return queue_work(pm_wq, work); } extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int pm_runtime_force_resume(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); extern int __pm_runtime_resume(struct device *dev, int rpmflags); extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); extern void pm_runtime_irq_safe(struct device *dev); extern void __pm_runtime_use_autosuspend(struct device *dev, bool use); extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); extern void pm_runtime_release_supplier(struct device_link *link); extern int devm_pm_runtime_enable(struct device *dev); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE and its runtime PM usage counter is greater than 0. */ static inline int pm_runtime_get_if_in_use(struct device *dev) { return pm_runtime_get_if_active(dev, false); } /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. * @dev: Target device. * @enable: Whether or not to ignore possible dependencies on children. * * The dependencies of @dev on its children will not be taken into account by * the runtime PM framework going forward if @enable is %true, or they will * be taken into account otherwise. 
*/ static inline void pm_suspend_ignore_children(struct device *dev, bool enable) { dev->power.ignore_children = enable; } /** * pm_runtime_get_noresume - Bump up runtime PM usage counter of a device. * @dev: Target device. */ static inline void pm_runtime_get_noresume(struct device *dev) { atomic_inc(&dev->power.usage_count); } /** * pm_runtime_put_noidle - Drop runtime PM usage counter of a device. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev unless it is 0 already. */ static inline void pm_runtime_put_noidle(struct device *dev) { atomic_add_unless(&dev->power.usage_count, -1, 0); } /** * pm_runtime_suspended - Check whether or not a device is runtime-suspended. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev and its runtime PM status is * %RPM_SUSPENDED, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED && !dev->power.disable_depth; } /** * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_active(struct device *dev) { return dev->power.runtime_status == RPM_ACTIVE || dev->power.disable_depth; } /** * pm_runtime_status_suspended - Check if runtime PM status is "suspended". * @dev: Target device. * * Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false * otherwise, regardless of whether or not runtime PM has been enabled for @dev. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which the * runtime PM status of @dev cannot change. */ static inline bool pm_runtime_status_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED; } /** * pm_runtime_enabled - Check if runtime PM is enabled. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev. */ static inline bool pm_runtime_enabled(struct device *dev) { return !dev->power.disable_depth; } /** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. * * Return %true if @dev is a special device without runtime PM callbacks or * %false otherwise. */ static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return dev->power.no_callbacks; } /** * pm_runtime_mark_last_busy - Update the last access time of a device. * @dev: Target device. * * Update the last access time of @dev used by the runtime PM autosuspend * mechanism to the current time as returned by ktime_get_mono_fast_ns(). 
*/ static inline void pm_runtime_mark_last_busy(struct device *dev) { WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns()); } /** * pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context. * @dev: Target device. * * Return %true if @dev has been marked as an "IRQ-safe" device (with respect * to runtime PM), in which case its runtime PM callabcks can be expected to * work correctly when invoked from interrupt handlers. */ static inline bool pm_runtime_is_irq_safe(struct device *dev) { return dev->power.irq_safe; } extern u64 pm_runtime_suspended_time(struct device *dev); #else /* !CONFIG_PM */ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int pm_runtime_force_resume(struct device *dev) { return 0; } static inline int __pm_runtime_idle(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_suspend(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_resume(struct device *dev, int rpmflags) { return 1; } static inline int pm_schedule_suspend(struct device *dev, unsigned int delay) { return -ENOSYS; } static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; } static inline int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) { return -EINVAL; } static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} static inline int devm_pm_runtime_enable(struct device *dev) { return 0; } static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} static inline bool pm_runtime_suspended(struct device *dev) { return false; } static inline bool pm_runtime_active(struct device *dev) { return true; } static inline bool pm_runtime_status_suspended(struct device *dev) { return false; } static inline bool pm_runtime_enabled(struct device *dev) { return false; } static inline void pm_runtime_no_callbacks(struct device *dev) {} static inline void pm_runtime_irq_safe(struct device *dev) {} static inline bool pm_runtime_is_irq_safe(struct device *dev) { return false; } static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return false; } static inline void pm_runtime_mark_last_busy(struct device *dev) {} static inline void __pm_runtime_use_autosuspend(struct device *dev, bool use) {} static inline void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) {} static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} static 
inline void pm_runtime_release_supplier(struct device_link *link) {} #endif /* !CONFIG_PM */ /** * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it. * @dev: Target device. * * Invoke the "idle check" callback of @dev and, depending on its return value, * set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend has been enabled for it). */ static inline int pm_runtime_idle(struct device *dev) { return __pm_runtime_idle(dev, 0); } /** * pm_runtime_suspend - Suspend a device synchronously. * @dev: Target device. */ static inline int pm_runtime_suspend(struct device *dev) { return __pm_runtime_suspend(dev, 0); } /** * pm_runtime_autosuspend - Set up autosuspend of a device or suspend it. * @dev: Target device. * * Set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend is enabled for it) without engaging its "idle check" callback. */ static inline int pm_runtime_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_AUTO); } /** * pm_runtime_resume - Resume a device synchronously. * @dev: Target device. */ static inline int pm_runtime_resume(struct device *dev) { return __pm_runtime_resume(dev, 0); } /** * pm_request_idle - Queue up "idle check" execution for a device. * @dev: Target device. * * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev * asynchronously. */ static inline int pm_request_idle(struct device *dev) { return __pm_runtime_idle(dev, RPM_ASYNC); } /** * pm_request_resume - Queue up runtime-resume of a device. * @dev: Target device. */ static inline int pm_request_resume(struct device *dev) { return __pm_runtime_resume(dev, RPM_ASYNC); } /** * pm_request_autosuspend - Queue up autosuspend of a device. * @dev: Target device. * * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev * asynchronously. */ static inline int pm_request_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_get - Bump up usage counter and queue up resume of a device. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and queue up a work item to * carry out runtime-resume of it. */ static inline int pm_runtime_get(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_get_sync - Bump up usage counter of a device and resume it. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and carry out runtime-resume of * it synchronously. * * The possible return values of this function are the same as for * pm_runtime_resume() and the runtime PM usage counter of @dev remains * incremented in all cases, even if it returns an error code. * Consider using pm_runtime_resume_and_get() instead of it, especially * if its return value is checked by the caller, as this is likely to result * in cleaner code. */ static inline int pm_runtime_get_sync(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT); } /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. * * Resume @dev synchronously and if that is successful, increment its runtime * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been * incremented or a negative error code otherwise. 
*/ static inline int pm_runtime_resume_and_get(struct device *dev) { int ret; ret = __pm_runtime_resume(dev, RPM_GET_PUT); if (ret < 0) { pm_runtime_put_noidle(dev); return ret; } return 0; } /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). */ static inline int pm_runtime_put(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). */ static inline int pm_runtime_put_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, invoke the "idle check" callback of @dev and, depending on its * return value, set up autosuspend of @dev or suspend it (depending on whether * or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_idle() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, carry out runtime-suspend of @dev synchronously. * * The possible return values of this function are the same as for * pm_runtime_suspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_suspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending * on whether or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO); } /** * pm_runtime_set_active - Set runtime PM status to "active". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies * of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_active(struct device *dev) { return __pm_runtime_set_status(dev, RPM_ACTIVE); } /** * pm_runtime_set_suspended - Set runtime PM status to "suspended". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that * dependencies of it will be taken into account. 
* * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_suspended(struct device *dev) { return __pm_runtime_set_status(dev, RPM_SUSPENDED); } /** * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * * Prevent the runtime PM framework from working with @dev (by incrementing its * "blocking" counter). * * For each invocation of this function for @dev there must be a matching * pm_runtime_enable() call in order for runtime PM to be enabled for it. */ static inline void pm_runtime_disable(struct device *dev) { __pm_runtime_disable(dev, true); } /** * pm_runtime_use_autosuspend - Allow autosuspend to be used for a device. * @dev: Target device. * * Allow the runtime PM autosuspend mechanism to be used for @dev whenever * requested (or "autosuspend" will be handled as direct runtime-suspend for * it). * * NOTE: It's important to undo this with pm_runtime_dont_use_autosuspend() * at driver exit time unless your driver initially enabled pm_runtime * with devm_pm_runtime_enable() (which handles it for you). */ static inline void pm_runtime_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, true); } /** * pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used. * @dev: Target device. * * Prevent the runtime PM autosuspend mechanism from being used for @dev which * means that "autosuspend" will be handled as direct runtime-suspend for it * going forward. */ static inline void pm_runtime_dont_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, false); } #endif
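/*
 * Illustrative sketch, not part of pm_runtime.h above: the usual way a driver
 * consumes these helpers. The device callbacks and setup body are
 * hypothetical; the pm_runtime_* calls and the DEFINE_RUNTIME_DEV_PM_OPS()
 * wiring follow the documentation above.
 */
#include <linux/pm_runtime.h>

static int example_runtime_suspend(struct device *dev)
{
	/* Put the hardware into its low-power state here. */
	return 0;
}

static int example_runtime_resume(struct device *dev)
{
	/* Bring the hardware back to the fully functional state here. */
	return 0;
}

static int example_do_io(struct device *dev)
{
	int ret;

	/* Resume if needed and hold a usage-count reference while busy. */
	ret = pm_runtime_resume_and_get(dev);
	if (ret < 0)
		return ret;

	/* ... talk to the hardware ... */

	/* Record activity, drop the reference; autosuspend fires after the delay. */
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
	return 0;
}

static int example_setup_runtime_pm(struct device *dev)
{
	pm_runtime_set_autosuspend_delay(dev, 1000);	/* 1 s of idleness */
	pm_runtime_use_autosuspend(dev);
	/* The devm variant also undoes use_autosuspend on driver removal. */
	return devm_pm_runtime_enable(dev);
}

/*
 * System sleep reuses the runtime callbacks via pm_runtime_force_{suspend,resume}();
 * a driver would reference this from its driver struct, e.g. .pm = pm_ptr(&example_pm_ops).
 */
static DEFINE_RUNTIME_DEV_PM_OPS(example_pm_ops, example_runtime_suspend,
				 example_runtime_resume, NULL);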
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/srcu.h> #include <linux/interval_tree.h> struct mmu_notifier_subscriptions; struct mmu_notifier; struct mmu_notifier_range; struct mmu_interval_notifier; /** * enum mmu_notifier_event - reason for the mmu notifier callback * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that * move the range * * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like * madvise() or replacing a page by another one, ...).
* * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range * ie using the vma access permission (vm_page_prot) to update the whole range * is enough no need to inspect changes to the CPU page table (mprotect() * syscall) * * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for * pages in the range so to mirror those changes the user must inspect the CPU * page table (from the end callback). * * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same * access flags). User should soft dirty the page in the end callback to make * sure that anyone relying on soft dirtiness catch pages that might be written * through non CPU mappings. * * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * that the mm refcount is zero and the range is no longer accessible. * * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no * longer have exclusive access to the page. When sent during creation of an * exclusive range the owner will be initialised to the value provided by the * caller of make_device_exclusive_range(), otherwise the owner will be NULL. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_CLEAR, MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you've to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits. * * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. 
*/ int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. */ void (*change_pte)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address, pte_t pte); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN if sleeping would be required. * 0 should be returned otherwise. Please note that notifiers that can * fail invalidate_range_start are not allowed to implement * invalidate_range_end, as there is no mechanism for informing the * notifier that its start failed. */ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB * which shares page-tables with the CPU. The * invalidate_range_start()/end() callbacks should not be implemented as * invalidate_secondary_tlbs() already catches the points in time when * an external TLB needs to be flushed. 
* * This requires arch_invalidate_secondary_tlbs() to be called while * holding the ptl spin-lock and therefore this callback is not allowed * to sleep. * * This is called by architecture code whenever invalidating a TLB * entry. It is assumed that any secondary TLB has the same rules for * when invalidations are required. If this is not the case architecture * code will need to call this explicitly when required for secondary * TLB invalidation. */ void (*arch_invalidate_secondary_tlbs)( struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * These callbacks are used with the get/put interface to manage the * lifetime of the mmu_notifier memory. alloc_notifier() returns a new * notifier for use with the mm. * * free_notifier() is only called after the mmu_notifier has been * fully put, calls to any ops callback are prevented and no ops * callbacks are currently running. It is called from a SRCU callback * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); }; /* * The notifier chains are protected by mmap_lock and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_lock locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_lock is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; struct mm_struct *mm; struct rcu_head rcu; unsigned int users; }; /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. 
*/ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); }; struct mmu_interval_notifier { struct interval_tree_node interval_tree; const struct mmu_interval_notifier_ops *ops; struct mm_struct *mm; struct hlist_node deferred_item; unsigned long invalidate_seq; }; #ifdef CONFIG_MMU_NOTIFIER #ifdef CONFIG_LOCKDEP extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; enum mmu_notifier_event event; void *owner; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm); static inline struct mmu_notifier * mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence * @interval_sub - The subscription passed to invalidate * @cur_seq - The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call * mmu_interval_read_retry(). It updates the sequence number for later use by * mmu_interval_read_retry(). The provided cur_seq will always be odd. * * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, unsigned long cur_seq) { WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range * interval_sub: The subscription * seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). * * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * * Returns true if an invalidation collided with this critical section, and * the caller should retry. 
*/ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred * interval_sub: The subscription * seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() * and mmu_interval_read_retry(). A return of true indicates an invalidation * has collided with this critical region and a future * mmu_interval_read_retry() will return true. * * False is not reliable and only suggests a collision may not have * occurred. It can be called many times and does not have to hold the user * provided lock. * * This call can be used as part of loops and other expensive operations to * expedite a retry. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ return READ_ONCE(interval_sub->invalidate_seq) != seq; } extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return 0; } static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { if (mm_has_notifiers(mm)) __mmu_notifier_change_pte(mm, address, pte); } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { might_sleep(); lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); } /* * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it * can return an error if a notifier can't proceed without blocking, in which * case you're not allowed to modify PTEs in the specified range. 
* * This is mainly intended for OOM handling. */ static inline int __must_check mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; ret = __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { if (mmu_notifier_range_blockable(range)) might_sleep(); if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { mm->notifier_subscriptions = NULL; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); } static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, struct mm_struct *mm, unsigned long start, unsigned long end) { range->event = event; range->mm = mm; range->start = start; range->end = end; range->flags = flags; } static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, struct mm_struct *mm, unsigned long start, unsigned long end, void *owner) { mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PAGE_SIZE); \ __young; \ }) #define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PMD_SIZE); \ __young; \ }) #define ptep_clear_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PAGE_SIZE); \ __young; \ }) #define pmdp_clear_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PMD_SIZE); \ __young; \ }) /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU * pte invalidate must have already happened with a ptep_clear_flush() before * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is * required when we change both the protection of the mapping from read-only to * read-write and the pfn (like during copy on write page faults). 
Otherwise the * old page would remain mapped readonly in the secondary MMUs after the new * page is already writable by some CPU through the primary MMU. */ #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ unsigned long ___address = __address; \ pte_t ___pte = __pte; \ \ mmu_notifier_change_pte(___mm, ___address, ___pte); \ set_pte_at(___mm, ___address, __ptep, ___pte); \ }) #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { unsigned long start; unsigned long end; }; static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { range->start = start; range->end = end; } #define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) #define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return true; } static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return 0; } static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } #define mmu_notifier_range_update_to_read_only(r) false #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define set_pte_at_notify set_pte_at static inline void mmu_notifier_synchronize(void) { } #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */
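The mmu_interval_read_begin()/mmu_interval_read_retry() sequence documented above is meant to be used as a retry loop on the driver side, with mmu_interval_set_seq() called from the invalidate callback under the same driver-provided lock. The sketch below illustrates that protocol with a hypothetical my_mirror structure; the page-table walking and the registration via mmu_interval_notifier_insert() are elided, and none of this is taken from an in-tree driver.

#include <linux/mmu_notifier.h>
#include <linux/spinlock.h>

struct my_mirror {
	struct mmu_interval_notifier notifier;
	spinlock_t lock;	/* protects the device page table and the seq */
};

static bool my_mirror_invalidate(struct mmu_interval_notifier *interval_sub,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq)
{
	struct my_mirror *m = container_of(interval_sub, struct my_mirror, notifier);

	/* Only a spinlock is taken, so this callback never needs to sleep. */
	spin_lock(&m->lock);
	mmu_interval_set_seq(interval_sub, cur_seq);	/* make readers retry */
	/* ... invalidate device PTEs covering range->start .. range->end ... */
	spin_unlock(&m->lock);
	return true;
}

/* Passed to mmu_interval_notifier_insert() at setup time (not shown). */
static const struct mmu_interval_notifier_ops my_mirror_ops = {
	.invalidate = my_mirror_invalidate,
};

/* Read side: snapshot the CPU page table and commit it unless we raced. */
static void my_mirror_update(struct my_mirror *m)
{
	unsigned long seq;

	do {
		seq = mmu_interval_read_begin(&m->notifier);

		/* ... walk the CPU page table without holding m->lock ... */

		spin_lock(&m->lock);
		if (!mmu_interval_read_retry(&m->notifier, seq)) {
			/* ... program the snapshot into the device ... */
			spin_unlock(&m->lock);
			return;
		}
		spin_unlock(&m->lock);	/* an invalidation hit us, try again */
	} while (true);
}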
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return -EIO; } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) {
return -EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed);
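make_bad_inode() and iget_failed() above are typically reached from a filesystem's iget-style lookup helper when the on-disk inode cannot be read. A hedged sketch of such a call site follows; myfs_iget() and myfs_read_ondisk() are hypothetical names standing in for a real filesystem's code. Code that later encounters the inode can detect it with is_bad_inode() instead of issuing I/O that is bound to fail.

#include <linux/fs.h>

static int myfs_read_ondisk(struct inode *inode);	/* hypothetical: returns 0 or -EIO */

static struct inode *myfs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;
	int err;

	inode = iget_locked(sb, ino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;			/* found in the inode cache */

	err = myfs_read_ondisk(inode);
	if (err) {
		iget_failed(inode);		/* marks it bad, unlocks and releases it */
		return ERR_PTR(err);
	}

	unlock_new_inode(inode);
	return inode;
}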
// SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/mount.c - operations for initializing and mounting sysfs * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <net/net_namespace.h> #include "sysfs.h" static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; static int sysfs_get_tree(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; int ret; ret = kernfs_get_tree(fc); if (ret) return ret; if (kfc->new_sb_created) fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return 0; } static void sysfs_fs_context_free(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; if (kfc->ns_tag) kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag); kernfs_free_fs_context(fc); kfree(kfc); } static const struct fs_context_operations sysfs_fs_context_ops = { .free = sysfs_fs_context_free, .get_tree = sysfs_get_tree, }; static int sysfs_init_fs_context(struct fs_context *fc) { struct kernfs_fs_context *kfc; struct net *netns; if (!(fc->sb_flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return -EPERM; } kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL); if (!kfc) return -ENOMEM; kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); kfc->root = sysfs_root; kfc->magic = SYSFS_MAGIC; fc->fs_private = kfc; fc->ops = &sysfs_fs_context_ops; if (netns) { put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(netns->user_ns); } fc->global = true; return 0; } static void sysfs_kill_sb(struct super_block *sb) { void *ns = (void *)kernfs_super_ns(sb); kernfs_kill_sb(sb); kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); } static struct file_system_type sysfs_fs_type = { .name = "sysfs", .init_fs_context = sysfs_init_fs_context, .kill_sb = sysfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) { int err; sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); sysfs_root_kn = kernfs_root_to_node(sysfs_root); err = register_filesystem(&sysfs_fs_type); if (err) { kernfs_destroy_root(sysfs_root); return err; } return 0; }
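sysfs delegates most of the work above to kernfs, but the fs_context pattern it follows (init_fs_context installing the ops, get_tree building the superblock, a file_system_type tying them together) is generic. Below is a minimal, hypothetical non-kernfs variant using get_tree_nodev(); "myfs" and its callbacks are illustrative only, and the root-inode setup in fill_super is elided.

#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/init.h>

static int myfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	/* ... set sb->s_op, create the root inode and sb->s_root here ... */
	return 0;
}

static int myfs_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, myfs_fill_super);
}

static const struct fs_context_operations myfs_context_ops = {
	.get_tree	= myfs_get_tree,
};

static int myfs_init_fs_context(struct fs_context *fc)
{
	fc->ops = &myfs_context_ops;
	return 0;
}

static struct file_system_type myfs_type = {
	.name			= "myfs",
	.init_fs_context	= myfs_init_fs_context,
	.kill_sb		= kill_anon_super,
};

static int __init myfs_init(void)
{
	return register_filesystem(&myfs_type);
}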
// SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top.
* * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the lock is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interrupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
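refcount_dec_and_lock() and its variants above exist for the common "drop the last reference while the object is still reachable through a locked list" case. A short sketch, with a hypothetical object type and list lock:

#include <linux/refcount.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/slab.h>

struct my_obj {
	refcount_t refcnt;
	struct list_head node;		/* linked on a list under obj_list_lock */
};

static DEFINE_SPINLOCK(obj_list_lock);

static void my_obj_put(struct my_obj *obj)
{
	/*
	 * Fast path: the count stays above zero and no lock is taken.
	 * Slow path: returns true with obj_list_lock held and the count at
	 * zero, so the object can be unlinked and freed without racing a
	 * concurrent lookup that might otherwise re-take a reference.
	 */
	if (!refcount_dec_and_lock(&obj->refcnt, &obj_list_lock))
		return;

	list_del(&obj->node);
	spin_unlock(&obj_list_lock);
	kfree(obj);
}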
/* SPDX-License-Identifier: GPL-2.0 */ /* * Events for filesystem locks * * Copyright 2013 Jeff Layton <jlayton@poochiereds.net> */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filelock #if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILELOCK_H #include <linux/tracepoint.h> #include <linux/fs.h> #include <linux/device.h> #include <linux/kdev_t.h> #define show_fl_flags(val) \ __print_flags(val, "|", \ { FL_POSIX, "FL_POSIX" }, \ { FL_FLOCK, "FL_FLOCK" }, \ { FL_DELEG, "FL_DELEG" }, \ { FL_ACCESS, "FL_ACCESS" }, \ { FL_EXISTS, "FL_EXISTS" }, \ { FL_LEASE, "FL_LEASE" }, \ { FL_CLOSE, "FL_CLOSE" }, \ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ { FL_OFDLCK, "FL_OFDLCK" }) #define show_fl_type(val) \ __print_symbolic(val, \ { F_RDLCK, "F_RDLCK" }, \ { F_WRLCK, "F_WRLCK" }, \ { F_UNLCK, "F_UNLCK" }) TRACE_EVENT(locks_get_lock_context, TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx), TP_ARGS(inode, type, ctx), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(unsigned char, type) __field(struct file_lock_context *, ctx) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->type = type; __entry->ctx = ctx; ), TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, show_fl_type(__entry->type), __entry->ctx) ); DECLARE_EVENT_CLASS(filelock_lock, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_pid) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(loff_t, fl_start) __field(loff_t, fl_end) __field(int, ret) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_pid = fl ? fl->fl_pid : 0; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_start = fl ? fl->fl_start : 0; __entry->fl_end = fl ?
fl->fl_end : 0; __entry->ret = ret; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, __entry->fl_pid, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_start, __entry->fl_end, __entry->ret) ); DEFINE_EVENT(filelock_lock, posix_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, fcntl_setlk, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, locks_remove_posix, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DEFINE_EVENT(filelock_lock, flock_lock_inode, TP_PROTO(struct inode *inode, struct file_lock *fl, int ret), TP_ARGS(inode, fl, ret)); DECLARE_EVENT_CLASS(filelock_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(struct file_lock *, fl) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(struct file_lock *, fl_blocker) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) __field(unsigned long, fl_break_time) __field(unsigned long, fl_downgrade_time) ), TP_fast_assign( __entry->fl = fl ? fl : NULL; __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->fl_blocker = fl ? fl->fl_blocker : NULL; __entry->fl_owner = fl ? fl->fl_owner : NULL; __entry->fl_flags = fl ? fl->fl_flags : 0; __entry->fl_type = fl ? fl->fl_type : 0; __entry->fl_break_time = fl ? fl->fl_break_time : 0; __entry->fl_downgrade_time = fl ? fl->fl_downgrade_time : 0; ), TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu", __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->fl_blocker, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type), __entry->fl_break_time, __entry->fl_downgrade_time) ); DEFINE_EVENT(filelock_lease, break_lease_noblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_block, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, break_lease_unblock, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, generic_delete_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); DEFINE_EVENT(filelock_lease, time_out_leases, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl)); TRACE_EVENT(generic_add_lease, TP_PROTO(struct inode *inode, struct file_lock *fl), TP_ARGS(inode, fl), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(int, wcount) __field(int, rcount) __field(int, icount) __field(dev_t, s_dev) __field(fl_owner_t, fl_owner) __field(unsigned int, fl_flags) __field(unsigned char, fl_type) ), TP_fast_assign( __entry->s_dev = inode->i_sb->s_dev; __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); __entry->icount = atomic_read(&inode->i_count); __entry->fl_owner = fl->fl_owner; __entry->fl_flags = fl->fl_flags; __entry->fl_type = fl->fl_type; ), TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s 
fl_type=%s", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->wcount, __entry->rcount, __entry->icount, __entry->fl_owner, show_fl_flags(__entry->fl_flags), show_fl_type(__entry->fl_type)) ); TRACE_EVENT(leases_conflict, TP_PROTO(bool conflict, struct file_lock *lease, struct file_lock *breaker), TP_ARGS(conflict, lease, breaker), TP_STRUCT__entry( __field(void *, lease) __field(void *, breaker) __field(unsigned int, l_fl_flags) __field(unsigned int, b_fl_flags) __field(unsigned char, l_fl_type) __field(unsigned char, b_fl_type) __field(bool, conflict) ), TP_fast_assign( __entry->lease = lease; __entry->l_fl_flags = lease->fl_flags; __entry->l_fl_type = lease->fl_type; __entry->breaker = breaker; __entry->b_fl_flags = breaker->fl_flags; __entry->b_fl_type = breaker->fl_type; __entry->conflict = conflict; ), TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s", __entry->conflict, __entry->lease, show_fl_flags(__entry->l_fl_flags), show_fl_type(__entry->l_fl_type), __entry->breaker, show_fl_flags(__entry->b_fl_flags), show_fl_type(__entry->b_fl_type)) ); #endif /* _TRACE_FILELOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
// SPDX-License-Identifier: GPL-2.0-only /* * 6pack.c This module implements the 6pack protocol for kernel-based * devices like TTY. It interfaces between a raw TTY and the * kernel's AX.25 protocol layers. * * Authors: Andreas Könsgen <ajk@comnets.uni-bremen.de> * Ralf Baechle DL5RB <ralf@linux-mips.org> * * Quite a lot of stuff "stolen" by Joerg Reuter from slip.c, written by * * Laurence Culhane, <loz@holmes.demon.co.uk> * Fred N. van Kempen, <waltje@uwalt.nl.mugnet.org> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/timer.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/semaphore.h> #include <linux/refcount.h> #define SIXPACK_VERSION "Revision: 0.3.0" /* sixpack priority commands */ #define SIXP_SEOF 0x40 /* start and end of a 6pack frame */ #define SIXP_TX_URUN 0x48 /* transmit overrun */ #define SIXP_RX_ORUN 0x50 /* receive overrun */ #define SIXP_RX_BUF_OVL 0x58 /* receive buffer overflow */ #define SIXP_CHKSUM 0xFF /* valid checksum of a 6pack frame */ /* masks to get certain bits out of the status bytes sent by the TNC */ #define SIXP_CMD_MASK 0xC0 #define SIXP_CHN_MASK 0x07 #define SIXP_PRIO_CMD_MASK 0x80 #define SIXP_STD_CMD_MASK 0x40 #define SIXP_PRIO_DATA_MASK 0x38 #define SIXP_TX_MASK 0x20 #define SIXP_RX_MASK 0x10 #define SIXP_RX_DCD_MASK 0x18 #define SIXP_LEDS_ON 0x78 #define SIXP_LEDS_OFF 0x60 #define SIXP_CON 0x08 #define SIXP_STA 0x10 #define SIXP_FOUND_TNC 0xe9 #define SIXP_CON_ON 0x68 #define SIXP_DCD_MASK 0x08 #define SIXP_DAMA_OFF 0 /* default level 2 parameters */ #define SIXP_TXDELAY 25 /* 250 ms */ #define SIXP_PERSIST 50 /* in 256ths */ #define SIXP_SLOTTIME 10 /* 100 ms */ #define SIXP_INIT_RESYNC_TIMEOUT (3*HZ/2) /* in 1 s */ #define SIXP_RESYNC_TIMEOUT 5*HZ /* in 1 s */ /* 6pack configuration. */ #define SIXP_NRUNIT 31 /* MAX number of 6pack channels */ #define SIXP_MTU 256 /* Default MTU */ enum sixpack_flags { SIXPF_ERROR, /* Parity, etc. error */ }; struct sixpack { /* Various fields. */ struct tty_struct *tty; /* ptr to TTY structure */ struct net_device *dev; /* easy for intr handling */ /* These are pointers to the malloc()ed frame buffers. */ unsigned char *rbuff; /* receiver buffer */ int rcount; /* received chars counter */ unsigned char *xbuff; /* transmitter buffer */ unsigned char *xhead; /* next byte to XMIT */ int xleft; /* bytes left in XMIT queue */ unsigned char raw_buf[4]; unsigned char cooked_buf[400]; unsigned int rx_count; unsigned int rx_count_cooked; spinlock_t rxlock; int mtu; /* Our mtu (to spot changes!)
*/ int buffsize; /* Max buffers sizes */ unsigned long flags; /* Flag values/ mode etc */ unsigned char mode; /* 6pack mode */ /* 6pack stuff */ unsigned char tx_delay; unsigned char persistence; unsigned char slottime; unsigned char duplex; unsigned char led_state; unsigned char status; unsigned char status1; unsigned char status2; unsigned char tx_enable; unsigned char tnc_state; struct timer_list tx_t; struct timer_list resync_t; refcount_t refcnt; struct completion dead; spinlock_t lock; }; #define AX25_6PACK_HEADER_LEN 0 static void sixpack_decode(struct sixpack *, const unsigned char[], int); static int encode_sixpack(unsigned char *, unsigned char *, int, unsigned char); /* * Perform the persistence/slottime algorithm for CSMA access. If the * persistence check was successful, write the data to the serial driver. * Note that in case of DAMA operation, the data is not sent here. */ static void sp_xmit_on_air(struct timer_list *t) { struct sixpack *sp = from_timer(sp, t, tx_t); int actual, when = sp->slottime; static unsigned char random; random = random * 17 + 41; if (((sp->status1 & SIXP_DCD_MASK) == 0) && (random < sp->persistence)) { sp->led_state = 0x70; sp->tty->ops->write(sp->tty, &sp->led_state, 1); sp->tx_enable = 1; actual = sp->tty->ops->write(sp->tty, sp->xbuff, sp->status2); sp->xleft -= actual; sp->xhead += actual; sp->led_state = 0x60; sp->tty->ops->write(sp->tty, &sp->led_state, 1); sp->status2 = 0; } else mod_timer(&sp->tx_t, jiffies + ((when + 1) * HZ) / 100); } /* ----> 6pack timer interrupt handler and friends. <---- */ /* Encapsulate one AX.25 frame and stuff into a TTY queue. */ static void sp_encaps(struct sixpack *sp, unsigned char *icp, int len) { unsigned char *msg, *p = icp; int actual, count; if (len > sp->mtu) { /* sp->mtu = AX25_MTU = max. PACLEN = 256 */ msg = "oversized transmit packet!"; goto out_drop; } if (p[0] > 5) { msg = "invalid KISS command"; goto out_drop; } if ((p[0] != 0) && (len > 2)) { msg = "KISS control packet too long"; goto out_drop; } if ((p[0] == 0) && (len < 15)) { msg = "bad AX.25 packet to transmit"; goto out_drop; } count = encode_sixpack(p, sp->xbuff, len, sp->tx_delay); set_bit(TTY_DO_WRITE_WAKEUP, &sp->tty->flags); switch (p[0]) { case 1: sp->tx_delay = p[1]; return; case 2: sp->persistence = p[1]; return; case 3: sp->slottime = p[1]; return; case 4: /* ignored */ return; case 5: sp->duplex = p[1]; return; } if (p[0] != 0) return; /* * In case of fullduplex or DAMA operation, we don't take care about the * state of the DCD or of any timers, as the determination of the * correct time to send is the job of the AX.25 layer. We send * immediately after data has arrived. */ if (sp->duplex == 1) { sp->led_state = 0x70; sp->tty->ops->write(sp->tty, &sp->led_state, 1); sp->tx_enable = 1; actual = sp->tty->ops->write(sp->tty, sp->xbuff, count); sp->xleft = count - actual; sp->xhead = sp->xbuff + actual; sp->led_state = 0x60; sp->tty->ops->write(sp->tty, &sp->led_state, 1); } else { sp->xleft = count; sp->xhead = sp->xbuff; sp->status2 = count; sp_xmit_on_air(&sp->tx_t); } return; out_drop: sp->dev->stats.tx_dropped++; netif_start_queue(sp->dev); if (net_ratelimit()) printk(KERN_DEBUG "%s: %s - dropped.\n", sp->dev->name, msg); } /* Encapsulate an IP datagram and kick it into a TTY queue. */ static netdev_tx_t sp_xmit(struct sk_buff *skb, struct net_device *dev) { struct sixpack *sp = netdev_priv(dev); if (skb->protocol == htons(ETH_P_IP)) return ax25_ip_xmit(skb); spin_lock_bh(&sp->lock); /* We were not busy, so we are now... 
:-) */ netif_stop_queue(dev); dev->stats.tx_bytes += skb->len; sp_encaps(sp, skb->data, skb->len); spin_unlock_bh(&sp->lock); dev_kfree_skb(skb); return NETDEV_TX_OK; } static int sp_open_dev(struct net_device *dev) { struct sixpack *sp = netdev_priv(dev); if (sp->tty == NULL) return -ENODEV; return 0; } /* Close the low-level part of the 6pack channel. */ static int sp_close(struct net_device *dev) { struct sixpack *sp = netdev_priv(dev); spin_lock_bh(&sp->lock); if (sp->tty) { /* TTY discipline is running. */ clear_bit(TTY_DO_WRITE_WAKEUP, &sp->tty->flags); } netif_stop_queue(dev); spin_unlock_bh(&sp->lock); return 0; } static int sp_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr_ax25 *sa = addr; netif_tx_lock_bh(dev); netif_addr_lock(dev); __dev_addr_set(dev, &sa->sax25_call, AX25_ADDR_LEN); netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return 0; } static const struct net_device_ops sp_netdev_ops = { .ndo_open = sp_open_dev, .ndo_stop = sp_close, .ndo_start_xmit = sp_xmit, .ndo_set_mac_address = sp_set_mac_address, }; static void sp_setup(struct net_device *dev) { /* Finish setting up the DEVICE info. */ dev->netdev_ops = &sp_netdev_ops; dev->mtu = SIXP_MTU; dev->hard_header_len = AX25_MAX_HEADER_LEN; dev->header_ops = &ax25_header_ops; dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_AX25; dev->tx_queue_len = 10; /* Only activated in AX.25 mode */ memcpy(dev->broadcast, &ax25_bcast, AX25_ADDR_LEN); dev_addr_set(dev, (u8 *)&ax25_defaddr); dev->flags = 0; } /* Send one completely decapsulated IP datagram to the IP layer. */ /* * This is the routine that sends the received data to the kernel AX.25. * 'cmd' is the KISS command. For AX.25 data, it is zero. */ static void sp_bump(struct sixpack *sp, char cmd) { struct sk_buff *skb; int count; unsigned char *ptr; count = sp->rcount + 1; sp->dev->stats.rx_bytes += count; if ((skb = dev_alloc_skb(count + 1)) == NULL) goto out_mem; ptr = skb_put(skb, count + 1); *ptr++ = cmd; /* KISS command */ memcpy(ptr, sp->cooked_buf + 1, count); skb->protocol = ax25_type_trans(skb, sp->dev); netif_rx(skb); sp->dev->stats.rx_packets++; return; out_mem: sp->dev->stats.rx_dropped++; } /* ----------------------------------------------------------------------- */ /* * We have a potential race on dereferencing tty->disc_data, because the tty * layer provides no locking at all - thus one cpu could be running * sixpack_receive_buf while another calls sixpack_close, which zeroes * tty->disc_data and frees the memory that sixpack_receive_buf is using. The * best way to fix this is to use a rwlock in the tty struct, but for now we * use a single global rwlock for all ttys in ppp line discipline. */ static DEFINE_RWLOCK(disc_data_lock); static struct sixpack *sp_get(struct tty_struct *tty) { struct sixpack *sp; read_lock(&disc_data_lock); sp = tty->disc_data; if (sp) refcount_inc(&sp->refcnt); read_unlock(&disc_data_lock); return sp; } static void sp_put(struct sixpack *sp) { if (refcount_dec_and_test(&sp->refcnt)) complete(&sp->dead); } /* * Called by the TTY driver when there's room for more data. If we have * more packets to send, we send them here. 
*/ static void sixpack_write_wakeup(struct tty_struct *tty) { struct sixpack *sp = sp_get(tty); int actual; if (!sp) return; if (sp->xleft <= 0) { /* Now serial buffer is almost free & we can start * transmission of another packet */ sp->dev->stats.tx_packets++; clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); sp->tx_enable = 0; netif_wake_queue(sp->dev); goto out; } if (sp->tx_enable) { actual = tty->ops->write(tty, sp->xhead, sp->xleft); sp->xleft -= actual; sp->xhead += actual; } out: sp_put(sp); } /* ----------------------------------------------------------------------- */ /* * Handle the 'receiver data ready' interrupt. * This function is called by the tty module in the kernel when * a block of 6pack data has been received, which can now be decapsulated * and sent on to some IP layer for further processing. */ static void sixpack_receive_buf(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { struct sixpack *sp; int count1; if (!count) return; sp = sp_get(tty); if (!sp) return; /* Read the characters out of the buffer */ count1 = count; while (count) { count--; if (fp && *fp++) { if (!test_and_set_bit(SIXPF_ERROR, &sp->flags)) sp->dev->stats.rx_errors++; continue; } } sixpack_decode(sp, cp, count1); sp_put(sp); tty_unthrottle(tty); } /* * Try to resync the TNC. Called by the resync timer defined in * decode_prio_command */ #define TNC_UNINITIALIZED 0 #define TNC_UNSYNC_STARTUP 1 #define TNC_UNSYNCED 2 #define TNC_IN_SYNC 3 static void __tnc_set_sync_state(struct sixpack *sp, int new_tnc_state) { char *msg; switch (new_tnc_state) { default: /* gcc oh piece-o-crap ... */ case TNC_UNSYNC_STARTUP: msg = "Synchronizing with TNC"; break; case TNC_UNSYNCED: msg = "Lost synchronization with TNC\n"; break; case TNC_IN_SYNC: msg = "Found TNC"; break; } sp->tnc_state = new_tnc_state; printk(KERN_INFO "%s: %s\n", sp->dev->name, msg); } static inline void tnc_set_sync_state(struct sixpack *sp, int new_tnc_state) { int old_tnc_state = sp->tnc_state; if (old_tnc_state != new_tnc_state) __tnc_set_sync_state(sp, new_tnc_state); } static void resync_tnc(struct timer_list *t) { struct sixpack *sp = from_timer(sp, t, resync_t); static char resync_cmd = 0xe8; /* clear any data that might have been received */ sp->rx_count = 0; sp->rx_count_cooked = 0; /* reset state machine */ sp->status = 1; sp->status1 = 1; sp->status2 = 0; /* resync the TNC */ sp->led_state = 0x60; sp->tty->ops->write(sp->tty, &sp->led_state, 1); sp->tty->ops->write(sp->tty, &resync_cmd, 1); /* Start resync timer again -- the TNC might be still absent */ mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); } static inline int tnc_init(struct sixpack *sp) { unsigned char inbyte = 0xe8; tnc_set_sync_state(sp, TNC_UNSYNC_STARTUP); sp->tty->ops->write(sp->tty, &inbyte, 1); mod_timer(&sp->resync_t, jiffies + SIXP_RESYNC_TIMEOUT); return 0; } /* * Open the high-level part of the 6pack channel. * This function is called by the TTY module when the * 6pack line discipline is called for. Because we are * sure the tty line exists, we only have to link it to * a free 6pcack channel... 
*/ static int sixpack_open(struct tty_struct *tty) { char *rbuff = NULL, *xbuff = NULL; struct net_device *dev; struct sixpack *sp; unsigned long len; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (tty->ops->write == NULL) return -EOPNOTSUPP; dev = alloc_netdev(sizeof(struct sixpack), "sp%d", NET_NAME_UNKNOWN, sp_setup); if (!dev) { err = -ENOMEM; goto out; } sp = netdev_priv(dev); sp->dev = dev; spin_lock_init(&sp->lock); spin_lock_init(&sp->rxlock); refcount_set(&sp->refcnt, 1); init_completion(&sp->dead); /* !!! length of the buffers. MTU is IP MTU, not PACLEN! */ len = dev->mtu * 2; rbuff = kmalloc(len + 4, GFP_KERNEL); xbuff = kmalloc(len + 4, GFP_KERNEL); if (rbuff == NULL || xbuff == NULL) { err = -ENOBUFS; goto out_free; } spin_lock_bh(&sp->lock); sp->tty = tty; sp->rbuff = rbuff; sp->xbuff = xbuff; sp->mtu = AX25_MTU + 73; sp->buffsize = len; sp->rcount = 0; sp->rx_count = 0; sp->rx_count_cooked = 0; sp->xleft = 0; sp->flags = 0; /* Clear ESCAPE & ERROR flags */ sp->duplex = 0; sp->tx_delay = SIXP_TXDELAY; sp->persistence = SIXP_PERSIST; sp->slottime = SIXP_SLOTTIME; sp->led_state = 0x60; sp->status = 1; sp->status1 = 1; sp->status2 = 0; sp->tx_enable = 0; netif_start_queue(dev); timer_setup(&sp->tx_t, sp_xmit_on_air, 0); timer_setup(&sp->resync_t, resync_tnc, 0); spin_unlock_bh(&sp->lock); /* Done. We have linked the TTY line to a channel. */ tty->disc_data = sp; tty->receive_room = 65536; /* Now we're ready to register. */ err = register_netdev(dev); if (err) goto out_free; tnc_init(sp); return 0; out_free: kfree(xbuff); kfree(rbuff); free_netdev(dev); out: return err; } /* * Close down a 6pack channel. * This means flushing out any pending queues, and then restoring the * TTY line discipline to what it was before it got hooked to 6pack * (which usually is TTY again). */ static void sixpack_close(struct tty_struct *tty) { struct sixpack *sp; write_lock_irq(&disc_data_lock); sp = tty->disc_data; tty->disc_data = NULL; write_unlock_irq(&disc_data_lock); if (!sp) return; /* * We have now ensured that nobody can start using ap from now on, but * we have to wait for all existing users to finish. */ if (!refcount_dec_and_test(&sp->refcnt)) wait_for_completion(&sp->dead); /* We must stop the queue to avoid potentially scribbling * on the free buffers. The sp->dead completion is not sufficient * to protect us from sp->xbuff access. */ netif_stop_queue(sp->dev); unregister_netdev(sp->dev); del_timer_sync(&sp->tx_t); del_timer_sync(&sp->resync_t); /* Free all 6pack frame buffers after unreg. */ kfree(sp->rbuff); kfree(sp->xbuff); free_netdev(sp->dev); } /* Perform I/O control on an active 6pack channel. */ static int sixpack_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct sixpack *sp = sp_get(tty); struct net_device *dev; unsigned int tmp, err; if (!sp) return -ENXIO; dev = sp->dev; switch(cmd) { case SIOCGIFNAME: err = copy_to_user((void __user *) arg, dev->name, strlen(dev->name) + 1) ? 
-EFAULT : 0; break; case SIOCGIFENCAP: err = put_user(0, (int __user *) arg); break; case SIOCSIFENCAP: if (get_user(tmp, (int __user *) arg)) { err = -EFAULT; break; } sp->mode = tmp; dev->addr_len = AX25_ADDR_LEN; dev->hard_header_len = AX25_KISS_HEADER_LEN + AX25_MAX_HEADER_LEN + 3; dev->type = ARPHRD_AX25; err = 0; break; case SIOCSIFHWADDR: { char addr[AX25_ADDR_LEN]; if (copy_from_user(&addr, (void __user *)arg, AX25_ADDR_LEN)) { err = -EFAULT; break; } netif_tx_lock_bh(dev); __dev_addr_set(dev, &addr, AX25_ADDR_LEN); netif_tx_unlock_bh(dev); err = 0; break; } default: err = tty_mode_ioctl(tty, cmd, arg); } sp_put(sp); return err; } static struct tty_ldisc_ops sp_ldisc = { .owner = THIS_MODULE, .num = N_6PACK, .name = "6pack", .open = sixpack_open, .close = sixpack_close, .ioctl = sixpack_ioctl, .receive_buf = sixpack_receive_buf, .write_wakeup = sixpack_write_wakeup, }; /* Initialize 6pack control device -- register 6pack line discipline */ static const char msg_banner[] __initconst = KERN_INFO \ "AX.25: 6pack driver, " SIXPACK_VERSION "\n"; static const char msg_regfail[] __initconst = KERN_ERR \ "6pack: can't register line discipline (err = %d)\n"; static int __init sixpack_init_driver(void) { int status; printk(msg_banner); /* Register the provided line protocol discipline */ status = tty_register_ldisc(&sp_ldisc); if (status) printk(msg_regfail, status); return status; } static void __exit sixpack_exit_driver(void) { tty_unregister_ldisc(&sp_ldisc); } /* encode an AX.25 packet into 6pack */ static int encode_sixpack(unsigned char *tx_buf, unsigned char *tx_buf_raw, int length, unsigned char tx_delay) { int count = 0; unsigned char checksum = 0, buf[400]; int raw_count = 0; tx_buf_raw[raw_count++] = SIXP_PRIO_CMD_MASK | SIXP_TX_MASK; tx_buf_raw[raw_count++] = SIXP_SEOF; buf[0] = tx_delay; for (count = 1; count < length; count++) buf[count] = tx_buf[count]; for (count = 0; count < length; count++) checksum += buf[count]; buf[length] = (unsigned char) 0xff - checksum; for (count = 0; count <= length; count++) { if ((count % 3) == 0) { tx_buf_raw[raw_count++] = (buf[count] & 0x3f); tx_buf_raw[raw_count] = ((buf[count] >> 2) & 0x30); } else if ((count % 3) == 1) { tx_buf_raw[raw_count++] |= (buf[count] & 0x0f); tx_buf_raw[raw_count] = ((buf[count] >> 2) & 0x3c); } else { tx_buf_raw[raw_count++] |= (buf[count] & 0x03); tx_buf_raw[raw_count++] = (buf[count] >> 2); } } if ((length % 3) != 2) raw_count++; tx_buf_raw[raw_count++] = SIXP_SEOF; return raw_count; } /* decode 4 sixpack-encoded bytes into 3 data bytes */ static void decode_data(struct sixpack *sp, unsigned char inbyte) { unsigned char *buf; if (sp->rx_count != 3) { sp->raw_buf[sp->rx_count++] = inbyte; return; } if (sp->rx_count_cooked + 2 >= sizeof(sp->cooked_buf)) { pr_err("6pack: cooked buffer overrun, data loss\n"); sp->rx_count = 0; return; } buf = sp->raw_buf; sp->cooked_buf[sp->rx_count_cooked++] = buf[0] | ((buf[1] << 2) & 0xc0); sp->cooked_buf[sp->rx_count_cooked++] = (buf[1] & 0x0f) | ((buf[2] << 2) & 0xf0); sp->cooked_buf[sp->rx_count_cooked++] = (buf[2] & 0x03) | (inbyte << 2); sp->rx_count = 0; } /* identify and execute a 6pack priority command byte */ static void decode_prio_command(struct sixpack *sp, unsigned char cmd) { int actual; if ((cmd & SIXP_PRIO_DATA_MASK) != 0) { /* idle ? */ /* RX and DCD flags can only be set in the same prio command, if the DCD flag has been set without the RX flag in the previous prio command. If DCD has not been set before, something in the transmission has gone wrong. 
In this case, RX and DCD are cleared in order to prevent the decode_data routine from reading further data that might be corrupt. */ if (((sp->status & SIXP_DCD_MASK) == 0) && ((cmd & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK)) { if (sp->status != 1) printk(KERN_DEBUG "6pack: protocol violation\n"); else sp->status = 0; cmd &= ~SIXP_RX_DCD_MASK; } sp->status = cmd & SIXP_PRIO_DATA_MASK; } else { /* output watchdog char if idle */ if ((sp->status2 != 0) && (sp->duplex == 1)) { sp->led_state = 0x70; sp->tty->ops->write(sp->tty, &sp->led_state, 1); sp->tx_enable = 1; actual = sp->tty->ops->write(sp->tty, sp->xbuff, sp->status2); sp->xleft -= actual; sp->xhead += actual; sp->led_state = 0x60; sp->status2 = 0; } } /* needed to trigger the TNC watchdog */ sp->tty->ops->write(sp->tty, &sp->led_state, 1); /* if the state byte has been received, the TNC is present, so the resync timer can be reset. */ if (sp->tnc_state == TNC_IN_SYNC) mod_timer(&sp->resync_t, jiffies + SIXP_INIT_RESYNC_TIMEOUT); sp->status1 = cmd & SIXP_PRIO_DATA_MASK; } /* identify and execute a standard 6pack command byte */ static void decode_std_command(struct sixpack *sp, unsigned char cmd) { unsigned char checksum = 0, rest = 0; short i; switch (cmd & SIXP_CMD_MASK) { /* normal command */ case SIXP_SEOF: if ((sp->rx_count == 0) && (sp->rx_count_cooked == 0)) { if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK) { sp->led_state = 0x68; sp->tty->ops->write(sp->tty, &sp->led_state, 1); } } else { sp->led_state = 0x60; /* fill trailing bytes with zeroes */ sp->tty->ops->write(sp->tty, &sp->led_state, 1); spin_lock_bh(&sp->rxlock); rest = sp->rx_count; if (rest != 0) for (i = rest; i <= 3; i++) decode_data(sp, 0); if (rest == 2) sp->rx_count_cooked -= 2; else if (rest == 3) sp->rx_count_cooked -= 1; for (i = 0; i < sp->rx_count_cooked; i++) checksum += sp->cooked_buf[i]; if (checksum != SIXP_CHKSUM) { printk(KERN_DEBUG "6pack: bad checksum %2.2x\n", checksum); } else { sp->rcount = sp->rx_count_cooked-2; sp_bump(sp, 0); } sp->rx_count_cooked = 0; spin_unlock_bh(&sp->rxlock); } break; case SIXP_TX_URUN: printk(KERN_DEBUG "6pack: TX underrun\n"); break; case SIXP_RX_ORUN: printk(KERN_DEBUG "6pack: RX overrun\n"); break; case SIXP_RX_BUF_OVL: printk(KERN_DEBUG "6pack: RX buffer overflow\n"); } } /* decode a 6pack packet */ static void sixpack_decode(struct sixpack *sp, const unsigned char *pre_rbuff, int count) { unsigned char inbyte; int count1; for (count1 = 0; count1 < count; count1++) { inbyte = pre_rbuff[count1]; if (inbyte == SIXP_FOUND_TNC) { tnc_set_sync_state(sp, TNC_IN_SYNC); del_timer(&sp->resync_t); } if ((inbyte & SIXP_PRIO_CMD_MASK) != 0) decode_prio_command(sp, inbyte); else if ((inbyte & SIXP_STD_CMD_MASK) != 0) decode_std_command(sp, inbyte); else if ((sp->status & SIXP_RX_DCD_MASK) == SIXP_RX_DCD_MASK) { spin_lock_bh(&sp->rxlock); decode_data(sp, inbyte); spin_unlock_bh(&sp->rxlock); } } } MODULE_AUTHOR("Ralf Baechle DO1GRB <ralf@linux-mips.org>"); MODULE_DESCRIPTION("6pack driver for AX.25"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_6PACK); module_init(sixpack_init_driver); module_exit(sixpack_exit_driver);
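/*
 * Illustrative sketch (not part of the driver): a minimal user-space model of
 * the 3-byte -> 4-byte packing performed by encode_sixpack() and undone by
 * decode_data() above.  Each raw byte carries only 6 payload bits, leaving the
 * top two bits free for the 6pack command encoding.  The helper names pack3()
 * and unpack3() and the sample data in main() are invented for this example.
 */
#include <assert.h>
#include <stdio.h>

static void pack3(const unsigned char d[3], unsigned char r[4])
{
	r[0] = d[0] & 0x3f;                          /* low 6 bits of d0            */
	r[1] = ((d[0] >> 2) & 0x30) | (d[1] & 0x0f); /* top 2 of d0, low 4 of d1    */
	r[2] = ((d[1] >> 2) & 0x3c) | (d[2] & 0x03); /* top 4 of d1, low 2 of d2    */
	r[3] = d[2] >> 2;                            /* top 6 bits of d2            */
}

static void unpack3(const unsigned char r[4], unsigned char d[3])
{
	/* Same expressions as decode_data() above. */
	d[0] = r[0] | ((r[1] << 2) & 0xc0);
	d[1] = (r[1] & 0x0f) | ((r[2] << 2) & 0xf0);
	d[2] = (r[2] & 0x03) | (r[3] << 2);
}

int main(void)
{
	unsigned char data[3] = { 0xc0, 0x12, 0xff }, raw[4], back[3];
	int i;

	pack3(data, raw);
	unpack3(raw, back);
	for (i = 0; i < 3; i++)
		assert(back[i] == data[i]);     /* packing round-trips            */
	for (i = 0; i < 4; i++)
		assert((raw[i] & 0xc0) == 0);   /* only 6 bits used per raw byte  */
	printf("6pack 3->4 packing round trip ok\n");
	return 0;
}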
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-or-later /* * uvc_entity.c -- USB Video Class driver * * Copyright (C) 2005-2011 * Laurent Pinchart (laurent.pinchart@ideasonboard.com) */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/videodev2.h> #include <media/v4l2-common.h> #include "uvcvideo.h" static int uvc_mc_create_links(struct uvc_video_chain *chain, struct uvc_entity *entity) { const u32 flags = MEDIA_LNK_FL_ENABLED | MEDIA_LNK_FL_IMMUTABLE; struct media_entity *sink; unsigned int i; int ret; sink = (UVC_ENTITY_TYPE(entity) == UVC_TT_STREAMING) ? (entity->vdev ? &entity->vdev->entity : NULL) : &entity->subdev.entity; if (sink == NULL) return 0; for (i = 0; i < entity->num_pads; ++i) { struct media_entity *source; struct uvc_entity *remote; u8 remote_pad; if (!(entity->pads[i].flags & MEDIA_PAD_FL_SINK)) continue; remote = uvc_entity_by_id(chain->dev, entity->baSourceID[i]); if (remote == NULL || remote->num_pads == 0) return -EINVAL; source = (UVC_ENTITY_TYPE(remote) == UVC_TT_STREAMING) ? (remote->vdev ? &remote->vdev->entity : NULL) : &remote->subdev.entity; if (source == NULL) continue; remote_pad = remote->num_pads - 1; ret = media_create_pad_link(source, remote_pad, sink, i, flags); if (ret < 0) return ret; } return 0; } static const struct v4l2_subdev_ops uvc_subdev_ops = { }; void uvc_mc_cleanup_entity(struct uvc_entity *entity) { if (UVC_ENTITY_TYPE(entity) != UVC_TT_STREAMING) media_entity_cleanup(&entity->subdev.entity); else if (entity->vdev != NULL) media_entity_cleanup(&entity->vdev->entity); } static int uvc_mc_init_entity(struct uvc_video_chain *chain, struct uvc_entity *entity) { int ret; if (UVC_ENTITY_TYPE(entity) != UVC_TT_STREAMING) { u32 function; v4l2_subdev_init(&entity->subdev, &uvc_subdev_ops); strscpy(entity->subdev.name, entity->name, sizeof(entity->subdev.name)); switch (UVC_ENTITY_TYPE(entity)) { case UVC_VC_SELECTOR_UNIT: function = MEDIA_ENT_F_VID_MUX; break; case UVC_VC_PROCESSING_UNIT: case UVC_VC_EXTENSION_UNIT: /* For lack of a better option. 
*/ function = MEDIA_ENT_F_PROC_VIDEO_PIXEL_FORMATTER; break; case UVC_COMPOSITE_CONNECTOR: case UVC_COMPONENT_CONNECTOR: function = MEDIA_ENT_F_CONN_COMPOSITE; break; case UVC_SVIDEO_CONNECTOR: function = MEDIA_ENT_F_CONN_SVIDEO; break; case UVC_ITT_CAMERA: function = MEDIA_ENT_F_CAM_SENSOR; break; case UVC_TT_VENDOR_SPECIFIC: case UVC_ITT_VENDOR_SPECIFIC: case UVC_ITT_MEDIA_TRANSPORT_INPUT: case UVC_OTT_VENDOR_SPECIFIC: case UVC_OTT_DISPLAY: case UVC_OTT_MEDIA_TRANSPORT_OUTPUT: case UVC_EXTERNAL_VENDOR_SPECIFIC: case UVC_EXT_GPIO_UNIT: default: function = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN; break; } entity->subdev.entity.function = function; ret = media_entity_pads_init(&entity->subdev.entity, entity->num_pads, entity->pads); if (ret < 0) return ret; ret = v4l2_device_register_subdev(&chain->dev->vdev, &entity->subdev); } else if (entity->vdev != NULL) { ret = media_entity_pads_init(&entity->vdev->entity, entity->num_pads, entity->pads); if (entity->flags & UVC_ENTITY_FLAG_DEFAULT) entity->vdev->entity.flags |= MEDIA_ENT_FL_DEFAULT; } else ret = 0; return ret; } int uvc_mc_register_entities(struct uvc_video_chain *chain) { struct uvc_entity *entity; int ret; list_for_each_entry(entity, &chain->entities, chain) { ret = uvc_mc_init_entity(chain, entity); if (ret < 0) { dev_info(&chain->dev->udev->dev, "Failed to initialize entity for entity %u\n", entity->id); return ret; } } list_for_each_entry(entity, &chain->entities, chain) { ret = uvc_mc_create_links(chain, entity); if (ret < 0) { dev_info(&chain->dev->udev->dev, "Failed to create links for entity %u\n", entity->id); return ret; } } return 0; }
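/*
 * Illustrative sketch (not part of the driver): a simplified, self-contained
 * model of the linking rule in uvc_mc_create_links() above -- each sink pad i
 * of an entity is linked to the last pad of the entity named by baSourceID[i].
 * The toy_entity layout, find_entity() helper and the sample chain in main()
 * are invented for this example; only the pad-selection rule mirrors the code.
 */
#include <stdio.h>

struct toy_entity {
	unsigned int id;
	unsigned int num_pads;      /* total pads of this entity */
	unsigned int num_sinks;     /* pads 0..num_sinks-1 are sink pads */
	unsigned int source_id[4];  /* entity feeding each sink pad (baSourceID) */
};

static const struct toy_entity *find_entity(const struct toy_entity *tab,
					    unsigned int n, unsigned int id)
{
	unsigned int i;

	for (i = 0; i < n; i++)
		if (tab[i].id == id)
			return &tab[i];
	return NULL;
}

int main(void)
{
	/* camera sensor (1) -> processing unit (2) -> streaming terminal (3) */
	static const struct toy_entity chain[] = {
		{ .id = 1, .num_pads = 1, .num_sinks = 0 },
		{ .id = 2, .num_pads = 2, .num_sinks = 1, .source_id = { 1 } },
		{ .id = 3, .num_pads = 1, .num_sinks = 1, .source_id = { 2 } },
	};
	unsigned int n = sizeof(chain) / sizeof(chain[0]);
	unsigned int e, i;

	for (e = 0; e < n; e++) {
		for (i = 0; i < chain[e].num_sinks; i++) {
			const struct toy_entity *remote =
				find_entity(chain, n, chain[e].source_id[i]);

			if (!remote)
				continue;
			/* source pad is the remote's last pad, as in the driver */
			printf("link: entity %u pad %u -> entity %u pad %u\n",
			       remote->id, remote->num_pads - 1,
			       chain[e].id, i);
		}
	}
	return 0;
}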
// SPDX-License-Identifier: GPL-2.0-only /* * vfsv0 quota IO operations on file */ #include <linux/errno.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/dqblk_v2.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include
<linux/quotaops.h> #include <asm/byteorder.h> #include "quota_tree.h" MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota trie support"); MODULE_LICENSE("GPL"); #define __QUOTA_QT_PARANOIA static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) { unsigned int epb = info->dqi_usable_bs >> 2; depth = info->dqi_qtree_depth - depth - 1; while (depth--) id /= epb; return id % epb; } static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth) { qid_t id = from_kqid(&init_user_ns, qid); return __get_index(info, id, depth); } /* Number of entries in one blocks */ static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) { return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) / info->dqi_entry_size; } static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; memset(buf, 0, info->dqi_usable_bs); return sb->s_op->quota_read(sb, info->dqi_type, buf, info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); } static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; ssize_t ret; ret = sb->s_op->quota_write(sb, info->dqi_type, buf, info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -EIO; } return ret; } static inline int do_check_range(struct super_block *sb, const char *val_name, uint val, uint min_val, uint max_val) { if (val < min_val || val > max_val) { quota_error(sb, "Getting %s %u out of range %u-%u", val_name, val, min_val, max_val); return -EUCLEAN; } return 0; } static int check_dquot_block_header(struct qtree_mem_dqinfo *info, struct qt_disk_dqdbheader *dh) { int err = 0; err = do_check_range(info->dqi_sb, "dqdh_next_free", le32_to_cpu(dh->dqdh_next_free), 0, info->dqi_blocks - 1); if (err) return err; err = do_check_range(info->dqi_sb, "dqdh_prev_free", le32_to_cpu(dh->dqdh_prev_free), 0, info->dqi_blocks - 1); if (err) return err; err = do_check_range(info->dqi_sb, "dqdh_entries", le16_to_cpu(dh->dqdh_entries), 0, qtree_dqstr_in_blk(info)); return err; } /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int ret, blk; if (!buf) return -ENOMEM; if (info->dqi_free_blk) { blk = info->dqi_free_blk; ret = read_blk(info, blk, buf); if (ret < 0) goto out_buf; ret = check_dquot_block_header(info, dh); if (ret) goto out_buf; info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { memset(buf, 0, info->dqi_usable_bs); /* Assure block allocation... 
*/ ret = write_blk(info, info->dqi_blocks, buf); if (ret < 0) goto out_buf; blk = info->dqi_blocks++; } mark_info_dirty(info->dqi_sb, info->dqi_type); ret = blk; out_buf: kfree(buf); return ret; } /* Insert empty block to the list */ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) { struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); dh->dqdh_prev_free = cpu_to_le32(0); dh->dqdh_entries = cpu_to_le16(0); err = write_blk(info, blk, buf); if (err < 0) return err; info->dqi_free_blk = blk; mark_info_dirty(info->dqi_sb, info->dqi_type); return 0; } /* Remove given block from the list of blocks with free entries */ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; uint nextblk = le32_to_cpu(dh->dqdh_next_free); uint prevblk = le32_to_cpu(dh->dqdh_prev_free); int err; if (!tmpbuf) return -ENOMEM; if (nextblk) { err = read_blk(info, nextblk, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; err = write_blk(info, nextblk, tmpbuf); if (err < 0) goto out_buf; } if (prevblk) { err = read_blk(info, prevblk, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; err = write_blk(info, prevblk, tmpbuf); if (err < 0) goto out_buf; } else { info->dqi_free_entry = nextblk; mark_info_dirty(info->dqi_sb, info->dqi_type); } kfree(tmpbuf); dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); /* No matter whether write succeeds block is out of list */ if (write_blk(info, blk, buf) < 0) quota_error(info->dqi_sb, "Can't write block (%u) " "with free entries", blk); return 0; out_buf: kfree(tmpbuf); return err; } /* Insert given block to the beginning of list with free entries */ static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; if (!tmpbuf) return -ENOMEM; dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); dh->dqdh_prev_free = cpu_to_le32(0); err = write_blk(info, blk, buf); if (err < 0) goto out_buf; if (info->dqi_free_entry) { err = read_blk(info, info->dqi_free_entry, tmpbuf); if (err < 0) goto out_buf; ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); err = write_blk(info, info->dqi_free_entry, tmpbuf); if (err < 0) goto out_buf; } kfree(tmpbuf); info->dqi_free_entry = blk; mark_info_dirty(info->dqi_sb, info->dqi_type); return 0; out_buf: kfree(tmpbuf); return err; } /* Is the entry in the block free? 
*/ int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) { int i; for (i = 0; i < info->dqi_entry_size; i++) if (disk[i]) return 0; return 1; } EXPORT_SYMBOL(qtree_entry_unused); /* Find space for dquot */ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, int *err) { uint blk, i; struct qt_disk_dqdbheader *dh; char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); char *ddquot; *err = 0; if (!buf) { *err = -ENOMEM; return 0; } dh = (struct qt_disk_dqdbheader *)buf; if (info->dqi_free_entry) { blk = info->dqi_free_entry; *err = read_blk(info, blk, buf); if (*err < 0) goto out_buf; *err = check_dquot_block_header(info, dh); if (*err) goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { *err = blk; kfree(buf); return 0; } memset(buf, 0, info->dqi_usable_bs); /* This is enough as the block is already zeroed and the entry * list is empty... */ info->dqi_free_entry = blk; mark_info_dirty(dquot->dq_sb, dquot->dq_id.type); } /* Block will be full? */ if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { *err = remove_free_dqentry(info, buf, blk); if (*err < 0) { quota_error(dquot->dq_sb, "Can't remove block (%u) " "from entry free list", blk); goto out_buf; } } le16_add_cpu(&dh->dqdh_entries, 1); /* Find free structure in block */ ddquot = buf + sizeof(struct qt_disk_dqdbheader); for (i = 0; i < qtree_dqstr_in_blk(info); i++) { if (qtree_entry_unused(info, ddquot)) break; ddquot += info->dqi_entry_size; } #ifdef __QUOTA_QT_PARANOIA if (i == qtree_dqstr_in_blk(info)) { quota_error(dquot->dq_sb, "Data block full but it shouldn't"); *err = -EIO; goto out_buf; } #endif *err = write_blk(info, blk, buf); if (*err < 0) { quota_error(dquot->dq_sb, "Can't write quota data block %u", blk); goto out_buf; } dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; kfree(buf); return blk; out_buf: kfree(buf); return 0; } /* Insert reference to structure into the trie */ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *treeblk, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0, newson = 0, newact = 0; __le32 *ref; uint newblk; if (!buf) return -ENOMEM; if (!*treeblk) { ret = get_free_dqblk(info); if (ret < 0) goto out_buf; *treeblk = ret; memset(buf, 0, info->dqi_usable_bs); newact = 1; } else { ret = read_blk(info, *treeblk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read tree quota " "block %u", *treeblk); goto out_buf; } } ref = (__le32 *)buf; newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); ret = do_check_range(dquot->dq_sb, "block", newblk, 0, info->dqi_blocks - 1); if (ret) goto out_buf; if (!newblk) newson = 1; if (depth == info->dqi_qtree_depth - 1) { #ifdef __QUOTA_QT_PARANOIA if (newblk) { quota_error(dquot->dq_sb, "Inserting already present " "quota entry (block %u)", le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)])); ret = -EIO; goto out_buf; } #endif newblk = find_free_dqentry(info, dquot, &ret); } else { ret = do_insert_tree(info, dquot, &newblk, depth+1); } if (newson && ret >= 0) { ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(newblk); ret = write_blk(info, *treeblk, buf); } else if (newact && ret < 0) { put_free_dqblk(info, buf, *treeblk); } out_buf: kfree(buf); return ret; } /* Wrapper for inserting quota structure into tree */ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot) { int tmp = QT_TREEOFF; #ifdef 
__QUOTA_QT_PARANOIA if (info->dqi_blocks <= QT_TREEOFF) { quota_error(dquot->dq_sb, "Quota tree root isn't allocated!"); return -EIO; } #endif return do_insert_tree(info, dquot, &tmp, 0); } /* * We don't have to be afraid of deadlocks as we never have quotas on quota * files... */ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; ssize_t ret; char *ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS); if (!ddquot) return -ENOMEM; /* dq_off is guarded by dqio_sem */ if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { quota_error(sb, "Error %zd occurred while creating " "quota", ret); kfree(ddquot); return ret; } } spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->mem2disk_dqblk(ddquot, dquot); spin_unlock(&dquot->dq_dqb_lock); ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -ENOSPC; } else { ret = 0; } dqstats_inc(DQST_WRITES); kfree(ddquot); return ret; } EXPORT_SYMBOL(qtree_write_dquot); /* Free dquot entry in data block */ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { struct qt_disk_dqdbheader *dh; char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0; if (!buf) return -ENOMEM; if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota data block %u", blk); goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; ret = check_dquot_block_header(info, dh); if (ret) goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? 
*/ ret = remove_free_dqentry(info, buf, blk); if (ret >= 0) ret = put_free_dqblk(info, buf, blk); if (ret < 0) { quota_error(dquot->dq_sb, "Can't move quota data block " "(%u) to free list", blk); goto out_buf; } } else { memset(buf + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), 0, info->dqi_entry_size); if (le16_to_cpu(dh->dqdh_entries) == qtree_dqstr_in_blk(info) - 1) { /* Insert will write block itself */ ret = insert_free_dqentry(info, buf, blk); if (ret < 0) { quota_error(dquot->dq_sb, "Can't insert quota " "data block (%u) to free entry list", blk); goto out_buf; } } else { ret = write_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't write quota " "data block %u", blk); goto out_buf; } } } dquot->dq_off = 0; /* Quota is now unattached */ out_buf: kfree(buf); return ret; } /* Remove reference to dquot from tree */ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blk, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0; uint newblk; __le32 *ref = (__le32 *)buf; if (!buf) return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota data block %u", *blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF, info->dqi_blocks - 1); if (ret) goto out_buf; if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); newblk = 0; } else { ret = remove_tree(info, dquot, &newblk, depth+1); } if (ret >= 0 && !newblk) { int i; ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); /* Block got empty? */ for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++) ; /* Don't put the root block into the free block list */ if (i == (info->dqi_usable_bs >> 2) && *blk != QT_TREEOFF) { put_free_dqblk(info, buf, *blk); *blk = 0; } else { ret = write_blk(info, *blk, buf); if (ret < 0) quota_error(dquot->dq_sb, "Can't write quota tree block %u", *blk); } } out_buf: kfree(buf); return ret; } /* Delete dquot from tree */ int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { uint tmp = QT_TREEOFF; if (!dquot->dq_off) /* Even not allocated? 
*/ return 0; return remove_tree(info, dquot, &tmp, 0); } EXPORT_SYMBOL(qtree_delete_dquot); /* Find entry in block */ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); loff_t ret = 0; int i; char *ddquot; if (!buf) return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota tree " "block %u", blk); goto out_buf; } ddquot = buf + sizeof(struct qt_disk_dqdbheader); for (i = 0; i < qtree_dqstr_in_blk(info); i++) { if (info->dqi_ops->is_id(ddquot, dquot)) break; ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { quota_error(dquot->dq_sb, "Quota for id %u referenced but not present", from_kqid(&init_user_ns, dquot->dq_id)); ret = -EIO; goto out_buf; } else { ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; } out_buf: kfree(buf); return ret; } /* Find entry for given id in the tree */ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); loff_t ret = 0; __le32 *ref = (__le32 *)buf; if (!buf) return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(dquot->dq_sb, "Can't read quota tree block %u", blk); goto out_buf; } ret = 0; blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF, info->dqi_blocks - 1); if (ret) goto out_buf; if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); else ret = find_block_dqentry(info, dquot, blk); out_buf: kfree(buf); return ret; } /* Find entry for given id in the tree - wrapper function */ static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot) { return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); } int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; loff_t offset; char *ddquot; int ret = 0; #ifdef __QUOTA_QT_PARANOIA /* Invalidated quota? */ if (!sb_dqopt(dquot->dq_sb)->files[type]) { quota_error(sb, "Quota invalidated while reading!"); return -EIO; } #endif /* Do we know offset of the dquot entry in the quota file? */ if (!dquot->dq_off) { offset = find_dqentry(info, dquot); if (offset <= 0) { /* Entry not present? 
*/ if (offset < 0) quota_error(sb,"Can't read quota structure " "for id %u", from_kqid(&init_user_ns, dquot->dq_id)); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); ret = offset; goto out; } dquot->dq_off = offset; } ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS); if (!ddquot) return -ENOMEM; ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { if (ret >= 0) ret = -EIO; quota_error(sb, "Error while reading quota structure for id %u", from_kqid(&init_user_ns, dquot->dq_id)); set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); kfree(ddquot); goto out; } spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->disk2mem_dqblk(dquot, ddquot); if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit && !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); kfree(ddquot); out: dqstats_inc(DQST_READS); return ret; } EXPORT_SYMBOL(qtree_read_dquot); /* Check whether dquot should not be deleted. We know we are * the only one operating on dquot (thanks to dq_lock) */ int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) { if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) return qtree_delete_dquot(info, dquot); return 0; } EXPORT_SYMBOL(qtree_release_dquot); static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, unsigned int blk, int depth) { char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); __le32 *ref = (__le32 *)buf; ssize_t ret; unsigned int epb = info->dqi_usable_bs >> 2; unsigned int level_inc = 1; int i; if (!buf) return -ENOMEM; for (i = depth; i < info->dqi_qtree_depth - 1; i++) level_inc *= epb; ret = read_blk(info, blk, buf); if (ret < 0) { quota_error(info->dqi_sb, "Can't read quota tree block %u", blk); goto out_buf; } for (i = __get_index(info, *id, depth); i < epb; i++) { uint blk_no = le32_to_cpu(ref[i]); if (blk_no == 0) { *id += level_inc; continue; } ret = do_check_range(info->dqi_sb, "block", blk_no, 0, info->dqi_blocks - 1); if (ret) goto out_buf; if (depth == info->dqi_qtree_depth - 1) { ret = 0; goto out_buf; } ret = find_next_id(info, id, blk_no, depth + 1); if (ret != -ENOENT) break; } if (i == epb) { ret = -ENOENT; goto out_buf; } out_buf: kfree(buf); return ret; } int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid) { qid_t id = from_kqid(&init_user_ns, *qid); int ret; ret = find_next_id(info, &id, QT_TREEOFF, 0); if (ret < 0) return ret; *qid = make_kqid(&init_user_ns, qid->type, id); return 0; } EXPORT_SYMBOL(qtree_get_next_id);
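/*
 * Illustrative sketch (not part of the quota code): how __get_index() above
 * turns a quota id into one radix-tree index per tree level.  The block size,
 * tree depth and sample id below are assumptions for the example (a v2 quota
 * file typically uses 1024-byte blocks, i.e. 256 references per block, and a
 * 4-level tree); they are not values read from a real filesystem.
 */
#include <stdio.h>

static unsigned int toy_get_index(unsigned int id, int depth,
				  unsigned int epb, int tree_depth)
{
	int d = tree_depth - depth - 1;

	/* Same arithmetic as __get_index(): strip the lower-level digits. */
	while (d--)
		id /= epb;
	return id % epb;
}

int main(void)
{
	const unsigned int usable_bs = 1024;      /* usable bytes per block      */
	const unsigned int epb = usable_bs >> 2;  /* 32-bit references per block */
	const int tree_depth = 4;
	unsigned int id = 123456789;              /* arbitrary sample quota id   */
	unsigned int rebuilt = 0;
	int depth;

	for (depth = 0; depth < tree_depth; depth++) {
		unsigned int idx = toy_get_index(id, depth, epb, tree_depth);

		printf("depth %d -> slot %u\n", depth, idx);
		rebuilt = rebuilt * epb + idx;    /* indices are base-epb digits */
	}
	printf("id %u rebuilt from its tree path: %u\n", id, rebuilt);
	return 0;
}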
76 76 75 75 29 29 55 55 55 55 29 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 /* * net/tipc/subscr.c: TIPC network topology service * * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "name_table.h" #include "subscr.h" static void tipc_sub_send_event(struct tipc_subscription *sub, struct publication *p, u32 event) { struct tipc_subscr *s = &sub->evt.s; struct tipc_event *evt = &sub->evt; if (sub->inactive) return; tipc_evt_write(evt, event, event); if (p) { tipc_evt_write(evt, found_lower, p->sr.lower); tipc_evt_write(evt, found_upper, p->sr.upper); tipc_evt_write(evt, port.ref, p->sk.ref); tipc_evt_write(evt, port.node, p->sk.node); } else { tipc_evt_write(evt, found_lower, s->seq.lower); tipc_evt_write(evt, found_upper, s->seq.upper); tipc_evt_write(evt, port.ref, 0); tipc_evt_write(evt, port.node, 0); } tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt); } /** * tipc_sub_check_overlap - test for subscription overlap with the given values * @subscribed: the service range subscribed for * @found: the service range we are checking for match * * Returns true if there is overlap, otherwise false. 
*/ static bool tipc_sub_check_overlap(struct tipc_service_range *subscribed, struct tipc_service_range *found) { u32 found_lower = found->lower; u32 found_upper = found->upper; if (found_lower < subscribed->lower) found_lower = subscribed->lower; if (found_upper > subscribed->upper) found_upper = subscribed->upper; return found_lower <= found_upper; } void tipc_sub_report_overlap(struct tipc_subscription *sub, struct publication *p, u32 event, bool must) { struct tipc_service_range *sr = &sub->s.seq; u32 filter = sub->s.filter; if (!tipc_sub_check_overlap(sr, &p->sr)) return; if (!must && !(filter & TIPC_SUB_PORTS)) return; if (filter & TIPC_SUB_CLUSTER_SCOPE && p->scope == TIPC_NODE_SCOPE) return; if (filter & TIPC_SUB_NODE_SCOPE && p->scope != TIPC_NODE_SCOPE) return; spin_lock(&sub->lock); tipc_sub_send_event(sub, p, event); spin_unlock(&sub->lock); } static void tipc_sub_timeout(struct timer_list *t) { struct tipc_subscription *sub = from_timer(sub, t, timer); spin_lock(&sub->lock); tipc_sub_send_event(sub, NULL, TIPC_SUBSCR_TIMEOUT); sub->inactive = true; spin_unlock(&sub->lock); } static void tipc_sub_kref_release(struct kref *kref) { kfree(container_of(kref, struct tipc_subscription, kref)); } void tipc_sub_put(struct tipc_subscription *subscription) { kref_put(&subscription->kref, tipc_sub_kref_release); } void tipc_sub_get(struct tipc_subscription *subscription) { kref_get(&subscription->kref); } struct tipc_subscription *tipc_sub_subscribe(struct net *net, struct tipc_subscr *s, int conid) { u32 lower = tipc_sub_read(s, seq.lower); u32 upper = tipc_sub_read(s, seq.upper); u32 filter = tipc_sub_read(s, filter); struct tipc_subscription *sub; u32 timeout; if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) || lower > upper) { pr_warn("Subscription rejected, illegal request\n"); return NULL; } sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); return NULL; } INIT_LIST_HEAD(&sub->service_list); INIT_LIST_HEAD(&sub->sub_list); sub->net = net; sub->conid = conid; sub->inactive = false; memcpy(&sub->evt.s, s, sizeof(*s)); sub->s.seq.type = tipc_sub_read(s, seq.type); sub->s.seq.lower = lower; sub->s.seq.upper = upper; sub->s.filter = filter; sub->s.timeout = tipc_sub_read(s, timeout); memcpy(sub->s.usr_handle, s->usr_handle, 8); spin_lock_init(&sub->lock); kref_init(&sub->kref); if (!tipc_nametbl_subscribe(sub)) { kfree(sub); return NULL; } timer_setup(&sub->timer, tipc_sub_timeout, 0); timeout = tipc_sub_read(&sub->evt.s, timeout); if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); return sub; } void tipc_sub_unsubscribe(struct tipc_subscription *sub) { tipc_nametbl_unsubscribe(sub); if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) del_timer_sync(&sub->timer); list_del(&sub->sub_list); tipc_sub_put(sub); }
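/*
 * Illustrative sketch (not part of TIPC): the range-overlap test used by
 * tipc_sub_check_overlap() above, reduced to plain integers -- clamp the
 * published range to the subscribed one and report overlap iff the clamped
 * range is non-empty.  The struct name and the sample ranges are invented
 * for this example.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_range {
	unsigned int lower;
	unsigned int upper;
};

static bool toy_overlaps(const struct toy_range *subscribed,
			 const struct toy_range *found)
{
	unsigned int lo = found->lower < subscribed->lower ?
			  subscribed->lower : found->lower;
	unsigned int hi = found->upper > subscribed->upper ?
			  subscribed->upper : found->upper;

	return lo <= hi;
}

int main(void)
{
	struct toy_range sub  = { .lower = 100, .upper = 200 };
	struct toy_range hit  = { .lower = 150, .upper = 300 };
	struct toy_range miss = { .lower = 201, .upper = 300 };

	printf("150-300 vs 100-200: %s\n",
	       toy_overlaps(&sub, &hit) ? "overlap" : "no overlap");
	printf("201-300 vs 100-200: %s\n",
	       toy_overlaps(&sub, &miss) ? "overlap" : "no overlap");
	return 0;
}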
309 309 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TEXT_PATCHING_H #define _ASM_X86_TEXT_PATCHING_H #include <linux/types.h> #include <linux/stddef.h> #include <asm/ptrace.h> /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ #define POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. * Side-effect: any interrupt handler running between save and restore will have * the ability to write to read-only pages. * * Warning: * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * On the local CPU you need to be protected against NMI or MCE handlers seeing * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC #define RET_INSN_SIZE 1 #define RET_INSN_OPCODE 0xC3 #define CALL_INSN_SIZE 5 #define CALL_INSN_OPCODE 0xE8 #define JMP32_INSN_SIZE 5 #define JMP32_INSN_OPCODE 0xE9 #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB #define DISP32_SIZE 4 static __always_inline int text_opcode_size(u8 opcode) { int size = 0; #define __CASE(insn) \ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break switch(opcode) { __CASE(INT3); __CASE(RET); __CASE(CALL); __CASE(JMP32); __CASE(JMP8); } #undef __CASE return size; } union text_poke_insn { u8 text[POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; } __attribute__((packed)); }; static __always_inline void __text_gen_insn(void *buf, u8 opcode, const void *addr, const void *dest, int size) { union text_poke_insn *insn = buf; BUG_ON(size < text_opcode_size(opcode)); /* * Hide the addresses to avoid the compiler folding in constants when * referencing code, these can mess up annotations like * ANNOTATE_NOENDBR. 
*/ OPTIMIZER_HIDE_VAR(insn); OPTIMIZER_HIDE_VAR(addr); OPTIMIZER_HIDE_VAR(dest); insn->opcode = opcode; if (size > 1) { insn->disp = (long)dest - (long)(addr + size); if (size == 2) { /* * Ensure that for JMP8 the displacement * actually fits the signed byte. */ BUG_ON((insn->disp >> 31) != (insn->disp >> 7)); } } } static __always_inline void *text_gen_insn(u8 opcode, const void *addr, const void *dest) { static union text_poke_insn insn; /* per instance */ __text_gen_insn(&insn, opcode, addr, dest, text_opcode_size(opcode)); return &insn.text; } extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; #ifndef CONFIG_UML_X86 static __always_inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* * The int3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of * this gap. See the idtentry macro's create_gap option. * * Similarly entry_32.S will have a gap on the stack for (any) hardware * exception and pt_regs; see FIXUP_FRAME. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; } static __always_inline unsigned long int3_emulate_pop(struct pt_regs *regs) { unsigned long val = *(unsigned long *)regs->sp; regs->sp += sizeof(unsigned long); return val; } static __always_inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } static __always_inline void int3_emulate_ret(struct pt_regs *regs) { unsigned long ip = int3_emulate_pop(regs); int3_emulate_jmp(regs, ip); } static __always_inline void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp) { static const unsigned long jcc_mask[6] = { [0] = X86_EFLAGS_OF, [1] = X86_EFLAGS_CF, [2] = X86_EFLAGS_ZF, [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, [4] = X86_EFLAGS_SF, [5] = X86_EFLAGS_PF, }; bool invert = cc & 1; bool match; if (cc < 0xc) { match = regs->flags & jcc_mask[cc >> 1]; } else { match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); if (cc >= 0xe) match = match || (regs->flags & X86_EFLAGS_ZF); } if ((match && !invert) || (!match && invert)) ip += disp; int3_emulate_jmp(regs, ip); } #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */
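/*
 * Illustrative sketch (not part of the kernel header): how __text_gen_insn()
 * above computes the rel32 displacement of a 5-byte CALL/JMP32 -- the target
 * is encoded relative to the address of the *next* instruction, i.e.
 * disp = dest - (addr + size).  The sample addresses and the toy_gen_jmp32()
 * helper are invented for this example.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_JMP32_OPCODE 0xE9
#define TOY_JMP32_SIZE   5

static void toy_gen_jmp32(uint8_t buf[TOY_JMP32_SIZE],
			  uint64_t addr, uint64_t dest)
{
	/* Same formula as __text_gen_insn(): relative to end of instruction. */
	int32_t disp = (int32_t)(dest - (addr + TOY_JMP32_SIZE));

	buf[0] = TOY_JMP32_OPCODE;
	memcpy(&buf[1], &disp, sizeof(disp));   /* host byte order; LE on x86 */
}

int main(void)
{
	uint8_t insn[TOY_JMP32_SIZE];
	uint64_t addr = 0xffffffff81000000ull;  /* where the jmp is patched   */
	uint64_t dest = 0xffffffff81000100ull;  /* where it should jump to    */
	int i;

	toy_gen_jmp32(insn, addr, dest);
	printf("jmp to +0x100 from 0x%llx:", (unsigned long long)addr);
	for (i = 0; i < TOY_JMP32_SIZE; i++)
		printf(" %02x", insn[i]);
	printf("\n");                           /* expect: e9 fb 00 00 00     */
	return 0;
}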
3 2 2 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 
/* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ /* Reiserfs block (de)allocator, bitmap-based.
*/ #include <linux/time.h> #include "reiserfs.h" #include <linux/errno.h> #include <linux/buffer_head.h> #include <linux/kernel.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #define PREALLOCATION_SIZE 9 /* different reiserfs block allocator options */ #define SB_ALLOC_OPTS(s) (REISERFS_SB(s)->s_alloc_options.bits) #define _ALLOC_concentrating_formatted_nodes 0 #define _ALLOC_displacing_large_files 1 #define _ALLOC_displacing_new_packing_localities 2 #define _ALLOC_old_hashed_relocation 3 #define _ALLOC_new_hashed_relocation 4 #define _ALLOC_skip_busy 5 #define _ALLOC_displace_based_on_dirid 6 #define _ALLOC_hashed_formatted_nodes 7 #define _ALLOC_old_way 8 #define _ALLOC_hundredth_slices 9 #define _ALLOC_dirid_groups 10 #define _ALLOC_oid_groups 11 #define _ALLOC_packing_groups 12 #define concentrating_formatted_nodes(s) test_bit(_ALLOC_concentrating_formatted_nodes, &SB_ALLOC_OPTS(s)) #define displacing_large_files(s) test_bit(_ALLOC_displacing_large_files, &SB_ALLOC_OPTS(s)) #define displacing_new_packing_localities(s) test_bit(_ALLOC_displacing_new_packing_localities, &SB_ALLOC_OPTS(s)) #define SET_OPTION(optname) \ do { \ reiserfs_info(s, "block allocator option \"%s\" is set", #optname); \ set_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)); \ } while(0) #define TEST_OPTION(optname, s) \ test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) static inline void get_bit_address(struct super_block *s, b_blocknr_t block, unsigned int *bmap_nr, unsigned int *offset) { /* * It is in the bitmap block number equal to the block * number divided by the number of bits in a block. */ *bmap_nr = block >> (s->s_blocksize_bits + 3); /* Within that bitmap block it is located at bit offset *offset. */ *offset = block & ((s->s_blocksize << 3) - 1); } int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) { unsigned int bmap, offset; unsigned int bmap_count = reiserfs_bmap_count(s); if (block == 0 || block >= SB_BLOCK_COUNT(s)) { reiserfs_error(s, "vs-4010", "block number is out of range %lu (%u)", block, SB_BLOCK_COUNT(s)); return 0; } get_bit_address(s, block, &bmap, &offset); /* * Old format filesystem? Unlikely, but the bitmaps are all * up front so we need to account for it. */ if (unlikely(test_bit(REISERFS_OLD_FORMAT, &REISERFS_SB(s)->s_properties))) { b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1; if (block >= bmap1 && block <= bmap1 + bmap_count) { reiserfs_error(s, "vs-4019", "bitmap block %lu(%u) " "can't be freed or reused", block, bmap_count); return 0; } } else { if (offset == 0) { reiserfs_error(s, "vs-4020", "bitmap block %lu(%u) " "can't be freed or reused", block, bmap_count); return 0; } } if (bmap >= bmap_count) { reiserfs_error(s, "vs-4030", "bitmap for requested block " "is out of range: block=%lu, bitmap_nr=%u", block, bmap); return 0; } if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { reiserfs_error(s, "vs-4050", "this is root block (%u), " "it must be busy", SB_ROOT_BLOCK(s)); return 0; } return 1; } /* * Searches in journal structures for a given block number (bmap, off). * If block is found in reiserfs journal it suggests next free block * candidate to test. */ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap, int off, int *next) { b_blocknr_t tmp; if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) { if (tmp) { /* hint supplied */ *next = tmp; PROC_INFO_INC(s, scan_bitmap.in_journal_hint); } else { (*next) = off + 1; /* inc offset to avoid looping. 
*/ PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); } PROC_INFO_INC(s, scan_bitmap.retry); return 1; } return 0; } /* * Searches for a window of zero bits with given minimum and maximum * lengths in one bitmap block */ static int scan_bitmap_block(struct reiserfs_transaction_handle *th, unsigned int bmap_n, int *beg, int boundary, int min, int max, int unfm) { struct super_block *s = th->t_super; struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; struct buffer_head *bh; int end, next; int org = *beg; BUG_ON(!th->t_trans_id); RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of " "range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1); PROC_INFO_INC(s, scan_bitmap.bmap); if (!bi) { reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer " "for bitmap %d", bmap_n); return 0; } bh = reiserfs_read_bitmap_block(s, bmap_n); if (bh == NULL) return 0; while (1) { cont: if (bi->free_count < min) { brelse(bh); return 0; /* No free blocks in this bitmap */ } /* search for a first zero bit -- beginning of a window */ *beg = reiserfs_find_next_zero_le_bit ((unsigned long *)(bh->b_data), boundary, *beg); /* * search for a zero bit fails or the rest of bitmap block * cannot contain a zero window of minimum size */ if (*beg + min > boundary) { brelse(bh); return 0; } if (unfm && is_block_in_journal(s, bmap_n, *beg, beg)) continue; /* first zero bit found; we check next bits */ for (end = *beg + 1;; end++) { if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit(end, bh->b_data)) { next = end; break; } /* * finding the other end of zero bit window requires * looking into journal structures (in case of * searching for free blocks for unformatted nodes) */ if (unfm && is_block_in_journal(s, bmap_n, end, &next)) break; } /* * now (*beg) points to beginning of zero bits window, * (end) points to one bit after the window end */ /* found window of proper size */ if (end - *beg >= min) { int i; reiserfs_prepare_for_journal(s, bh, 1); /* * try to set all blocks used checking are * they still free */ for (i = *beg; i < end; i++) { /* Don't check in journal again. */ if (reiserfs_test_and_set_le_bit (i, bh->b_data)) { /* * bit was set by another process while * we slept in prepare_for_journal() */ PROC_INFO_INC(s, scan_bitmap.stolen); /* * we can continue with smaller set * of allocated blocks, if length of * this set is more or equal to `min' */ if (i >= *beg + min) { end = i; break; } /* * otherwise we clear all bit * were set ... 
*/ while (--i >= *beg) reiserfs_clear_le_bit (i, bh->b_data); reiserfs_restore_prepared_buffer(s, bh); *beg = org; /* * Search again in current block * from beginning */ goto cont; } } bi->free_count -= (end - *beg); journal_mark_dirty(th, bh); brelse(bh); /* free block count calculation */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); journal_mark_dirty(th, SB_BUFFER_WITH_SB(s)); return end - (*beg); } else { *beg = next; } } } static int bmap_hash_id(struct super_block *s, u32 id) { char *hash_in = NULL; unsigned long hash; unsigned bm; if (id <= 2) { bm = 1; } else { hash_in = (char *)(&id); hash = keyed_hash(hash_in, 4); bm = hash % reiserfs_bmap_count(s); if (!bm) bm = 1; } /* this can only be true when SB_BMAP_NR = 1 */ if (bm >= reiserfs_bmap_count(s)) bm = 0; return bm; } /* * hashes the id and then returns > 0 if the block group for the * corresponding hash is full */ static inline int block_group_used(struct super_block *s, u32 id) { int bm = bmap_hash_id(s, id); struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm]; /* * If we don't have cached information on this bitmap block, we're * going to have to load it later anyway. Loading it here allows us * to make a better decision. This favors long-term performance gain * with a better on-disk layout vs. a short term gain of skipping the * read and potentially having a bad placement. */ if (info->free_count == UINT_MAX) { struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm); brelse(bh); } if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) { return 0; } return 1; } /* * the packing is returned in disk byte order */ __le32 reiserfs_choose_packing(struct inode * dir) { __le32 packing; if (TEST_OPTION(packing_groups, dir->i_sb)) { u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); /* * some versions of reiserfsck expect packing locality 1 to be * special */ if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir)) packing = INODE_PKEY(dir)->k_objectid; else packing = INODE_PKEY(dir)->k_dir_id; } else packing = INODE_PKEY(dir)->k_objectid; return packing; } /* * Tries to find contiguous zero bit window (given size) in given region of * bitmap and place new blocks there. Returns number of allocated blocks. */ static int scan_bitmap(struct reiserfs_transaction_handle *th, b_blocknr_t * start, b_blocknr_t finish, int min, int max, int unfm, sector_t file_block) { int nr_allocated = 0; struct super_block *s = th->t_super; unsigned int bm, off; unsigned int end_bm, end_off; unsigned int off_max = s->s_blocksize << 3; BUG_ON(!th->t_trans_id); PROC_INFO_INC(s, scan_bitmap.call); /* No point in looking for more free blocks */ if (SB_FREE_BLOCKS(s) <= 0) return 0; get_bit_address(s, *start, &bm, &off); get_bit_address(s, finish, &end_bm, &end_off); if (bm > reiserfs_bmap_count(s)) return 0; if (end_bm > reiserfs_bmap_count(s)) end_bm = reiserfs_bmap_count(s); /* * When the bitmap is more than 10% free, anyone can allocate. * When it's less than 10% free, only files that already use the * bitmap are allowed. Once we pass 80% full, this restriction * is lifted. * * We do this so that files that grow later still have space close to * their original allocation. This improves locality, and presumably * performance as a result. * * This is only an allocation policy and does not make up for getting a * bad hint. Decent hinting must be implemented for this to work well. 
*/ if (TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) { for (; bm < end_bm; bm++, off = 0) { if ((off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10) nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); if (nr_allocated) goto ret; } /* we know from above that start is a reasonable number */ get_bit_address(s, *start, &bm, &off); } for (; bm < end_bm; bm++, off = 0) { nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); if (nr_allocated) goto ret; } nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); ret: *start = bm * off_max + off; return nr_allocated; } static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *inode, b_blocknr_t block, int for_unformatted) { struct super_block *s = th->t_super; struct reiserfs_super_block *rs; struct buffer_head *sbh, *bmbh; struct reiserfs_bitmap_info *apbi; unsigned int nr, offset; BUG_ON(!th->t_trans_id); PROC_INFO_INC(s, free_block); rs = SB_DISK_SUPER_BLOCK(s); sbh = SB_BUFFER_WITH_SB(s); apbi = SB_AP_BITMAP(s); get_bit_address(s, block, &nr, &offset); if (nr >= reiserfs_bmap_count(s)) { reiserfs_error(s, "vs-4075", "block %lu is out of range", block); return; } bmbh = reiserfs_read_bitmap_block(s, nr); if (!bmbh) return; reiserfs_prepare_for_journal(s, bmbh, 1); /* clear bit for the given block in bit map */ if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) { reiserfs_error(s, "vs-4080", "block %lu: bit already cleared", block); } apbi[nr].free_count++; journal_mark_dirty(th, bmbh); brelse(bmbh); reiserfs_prepare_for_journal(s, sbh, 1); /* update super block */ set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); journal_mark_dirty(th, sbh); if (for_unformatted) { int depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(inode, 1); reiserfs_write_lock_nested(s, depth); } } void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *inode, b_blocknr_t block, int for_unformatted) { struct super_block *s = th->t_super; BUG_ON(!th->t_trans_id); RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); if (!is_reusable(s, block, 1)) return; if (block > sb_block_count(REISERFS_SB(s)->s_rs)) { reiserfs_error(th->t_super, "bitmap-4072", "Trying to free block outside file system " "boundaries (%lu > %lu)", block, sb_block_count(REISERFS_SB(s)->s_rs)); return; } /* mark it before we clear it, just in case */ journal_mark_freed(th, s, block); _reiserfs_free_block(th, inode, block, for_unformatted); } /* preallocated blocks don't need to be run through journal_mark_freed */ static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, struct inode *inode, b_blocknr_t block) { BUG_ON(!th->t_trans_id); RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device"); if (!is_reusable(th->t_super, block, 1)) return; _reiserfs_free_block(th, inode, block, 1); } static void __discard_prealloc(struct reiserfs_transaction_handle *th, struct reiserfs_inode_info *ei) { unsigned long save = ei->i_prealloc_block; int dirty = 0; struct inode *inode = &ei->vfs_inode; BUG_ON(!th->t_trans_id); #ifdef CONFIG_REISERFS_CHECK if (ei->i_prealloc_count < 0) reiserfs_error(th->t_super, "zam-4001", "inode has negative prealloc blocks count."); #endif while (ei->i_prealloc_count > 0) { b_blocknr_t block_to_free; /* * reiserfs_free_prealloc_block can drop the write lock, * which could allow another caller to free the same block. 
* We can protect against it by modifying the prealloc * state before calling it. */ block_to_free = ei->i_prealloc_block++; ei->i_prealloc_count--; reiserfs_free_prealloc_block(th, inode, block_to_free); dirty = 1; } if (dirty) reiserfs_update_sd(th, inode); ei->i_prealloc_block = save; list_del_init(&ei->i_prealloc_list); } /* FIXME: It should be inline function */ void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, struct inode *inode) { struct reiserfs_inode_info *ei = REISERFS_I(inode); BUG_ON(!th->t_trans_id); if (ei->i_prealloc_count) __discard_prealloc(th, ei); } void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) { struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; BUG_ON(!th->t_trans_id); while (!list_empty(plist)) { struct reiserfs_inode_info *ei; ei = list_entry(plist->next, struct reiserfs_inode_info, i_prealloc_list); #ifdef CONFIG_REISERFS_CHECK if (!ei->i_prealloc_count) { reiserfs_error(th->t_super, "zam-4001", "inode is in prealloc list but has " "no preallocated blocks."); } #endif __discard_prealloc(th, ei); } } void reiserfs_init_alloc_options(struct super_block *s) { set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); } /* block allocator related options are parsed here */ int reiserfs_parse_alloc_options(struct super_block *s, char *options) { char *this_char, *value; /* clear default settings */ REISERFS_SB(s)->s_alloc_options.bits = 0; while ((this_char = strsep(&options, ":")) != NULL) { if ((value = strchr(this_char, '=')) != NULL) *value++ = 0; if (!strcmp(this_char, "concentrating_formatted_nodes")) { int temp; SET_OPTION(concentrating_formatted_nodes); temp = (value && *value) ? simple_strtoul(value, &value, 0) : 10; if (temp <= 0 || temp > 100) { REISERFS_SB(s)->s_alloc_options.border = 10; } else { REISERFS_SB(s)->s_alloc_options.border = 100 / temp; } continue; } if (!strcmp(this_char, "displacing_large_files")) { SET_OPTION(displacing_large_files); REISERFS_SB(s)->s_alloc_options.large_file_size = (value && *value) ? simple_strtoul(value, &value, 0) : 16; continue; } if (!strcmp(this_char, "displacing_new_packing_localities")) { SET_OPTION(displacing_new_packing_localities); continue; } if (!strcmp(this_char, "old_hashed_relocation")) { SET_OPTION(old_hashed_relocation); continue; } if (!strcmp(this_char, "new_hashed_relocation")) { SET_OPTION(new_hashed_relocation); continue; } if (!strcmp(this_char, "dirid_groups")) { SET_OPTION(dirid_groups); continue; } if (!strcmp(this_char, "oid_groups")) { SET_OPTION(oid_groups); continue; } if (!strcmp(this_char, "packing_groups")) { SET_OPTION(packing_groups); continue; } if (!strcmp(this_char, "hashed_formatted_nodes")) { SET_OPTION(hashed_formatted_nodes); continue; } if (!strcmp(this_char, "skip_busy")) { SET_OPTION(skip_busy); continue; } if (!strcmp(this_char, "hundredth_slices")) { SET_OPTION(hundredth_slices); continue; } if (!strcmp(this_char, "old_way")) { SET_OPTION(old_way); continue; } if (!strcmp(this_char, "displace_based_on_dirid")) { SET_OPTION(displace_based_on_dirid); continue; } if (!strcmp(this_char, "preallocmin")) { REISERFS_SB(s)->s_alloc_options.preallocmin = (value && *value) ? simple_strtoul(value, &value, 0) : 4; continue; } if (!strcmp(this_char, "preallocsize")) { REISERFS_SB(s)->s_alloc_options.preallocsize = (value && *value) ? 
simple_strtoul(value, &value, 0) : PREALLOCATION_SIZE; continue; } reiserfs_warning(s, "zam-4001", "unknown option - %s", this_char); return 1; } reiserfs_info(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); return 0; } static void print_sep(struct seq_file *seq, int *first) { if (!*first) seq_puts(seq, ":"); else *first = 0; } void show_alloc_options(struct seq_file *seq, struct super_block *s) { int first = 1; if (SB_ALLOC_OPTS(s) == ((1 << _ALLOC_skip_busy) | (1 << _ALLOC_dirid_groups) | (1 << _ALLOC_packing_groups))) return; seq_puts(seq, ",alloc="); if (TEST_OPTION(concentrating_formatted_nodes, s)) { print_sep(seq, &first); if (REISERFS_SB(s)->s_alloc_options.border != 10) { seq_printf(seq, "concentrating_formatted_nodes=%d", 100 / REISERFS_SB(s)->s_alloc_options.border); } else seq_puts(seq, "concentrating_formatted_nodes"); } if (TEST_OPTION(displacing_large_files, s)) { print_sep(seq, &first); if (REISERFS_SB(s)->s_alloc_options.large_file_size != 16) { seq_printf(seq, "displacing_large_files=%lu", REISERFS_SB(s)->s_alloc_options.large_file_size); } else seq_puts(seq, "displacing_large_files"); } if (TEST_OPTION(displacing_new_packing_localities, s)) { print_sep(seq, &first); seq_puts(seq, "displacing_new_packing_localities"); } if (TEST_OPTION(old_hashed_relocation, s)) { print_sep(seq, &first); seq_puts(seq, "old_hashed_relocation"); } if (TEST_OPTION(new_hashed_relocation, s)) { print_sep(seq, &first); seq_puts(seq, "new_hashed_relocation"); } if (TEST_OPTION(dirid_groups, s)) { print_sep(seq, &first); seq_puts(seq, "dirid_groups"); } if (TEST_OPTION(oid_groups, s)) { print_sep(seq, &first); seq_puts(seq, "oid_groups"); } if (TEST_OPTION(packing_groups, s)) { print_sep(seq, &first); seq_puts(seq, "packing_groups"); } if (TEST_OPTION(hashed_formatted_nodes, s)) { print_sep(seq, &first); seq_puts(seq, "hashed_formatted_nodes"); } if (TEST_OPTION(skip_busy, s)) { print_sep(seq, &first); seq_puts(seq, "skip_busy"); } if (TEST_OPTION(hundredth_slices, s)) { print_sep(seq, &first); seq_puts(seq, "hundredth_slices"); } if (TEST_OPTION(old_way, s)) { print_sep(seq, &first); seq_puts(seq, "old_way"); } if (TEST_OPTION(displace_based_on_dirid, s)) { print_sep(seq, &first); seq_puts(seq, "displace_based_on_dirid"); } if (REISERFS_SB(s)->s_alloc_options.preallocmin != 0) { print_sep(seq, &first); seq_printf(seq, "preallocmin=%d", REISERFS_SB(s)->s_alloc_options.preallocmin); } if (REISERFS_SB(s)->s_alloc_options.preallocsize != 17) { print_sep(seq, &first); seq_printf(seq, "preallocsize=%d", REISERFS_SB(s)->s_alloc_options.preallocsize); } } static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) { char *hash_in; if (hint->formatted_node) { hash_in = (char *)&hint->key.k_dir_id; } else { if (!hint->inode) { /*hint->search_start = hint->beg;*/ hash_in = (char *)&hint->key.k_dir_id; } else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); else hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); } hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); } /* * Relocation based on dirid, hashing them into a given bitmap block * files. 
Formatted nodes are unaffected, a separate policy covers them */ static void dirid_groups(reiserfs_blocknr_hint_t * hint) { unsigned long hash; __u32 dirid = 0; int bm = 0; struct super_block *sb = hint->th->t_super; if (hint->inode) dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); else if (hint->formatted_node) dirid = hint->key.k_dir_id; if (dirid) { bm = bmap_hash_id(sb, dirid); hash = bm * (sb->s_blocksize << 3); /* give a portion of the block group to metadata */ if (hint->inode) hash += sb->s_blocksize / 2; hint->search_start = hash; } } /* * Relocation based on oid, hashing them into a given bitmap block * files. Formatted nodes are unaffected, a separate policy covers them */ static void oid_groups(reiserfs_blocknr_hint_t * hint) { if (hint->inode) { unsigned long hash; __u32 oid; __u32 dirid; int bm; dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); /* * keep the root dir and it's first set of subdirs close to * the start of the disk */ if (dirid <= 2) hash = (hint->inode->i_sb->s_blocksize << 3); else { oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); bm = bmap_hash_id(hint->inode->i_sb, oid); hash = bm * (hint->inode->i_sb->s_blocksize << 3); } hint->search_start = hash; } } /* * returns 1 if it finds an indirect item and gets valid hint info * from it, otherwise 0 */ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) { struct treepath *path; struct buffer_head *bh; struct item_head *ih; int pos_in_item; __le32 *item; int ret = 0; /* * reiserfs code can call this function w/o pointer to path * structure supplied; then we rely on supplied search_start */ if (!hint->path) return 0; path = hint->path; bh = get_last_bh(path); RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); ih = tp_item_head(path); pos_in_item = path->pos_in_item; item = tp_item_body(path); hint->search_start = bh->b_blocknr; /* * for indirect item: go to left and look for the first non-hole entry * in the indirect item */ if (!hint->formatted_node && is_indirect_le_ih(ih)) { if (pos_in_item == I_UNFM_NUM(ih)) pos_in_item--; while (pos_in_item >= 0) { int t = get_block_num(item, pos_in_item); if (t) { hint->search_start = t; ret = 1; break; } pos_in_item--; } } /* does result value fit into specified region? */ return ret; } /* * should be, if formatted node, then try to put on first part of the device * specified as number of percent with mount option device, else try to put * on last of device. This is not to say it is good code to do so, * but the effect should be measured. 
*/ static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t * hint) { b_blocknr_t border = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; if (hint->formatted_node) hint->end = border - 1; else hint->beg = border; } static inline void displace_large_file(reiserfs_blocknr_hint_t * hint) { if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), 4) % (hint->end - hint->beg); else hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), 4) % (hint->end - hint->beg); } static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint) { char *hash_in; if (!hint->inode) hash_in = (char *)&hint->key.k_dir_id; else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); else hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); } static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t * hint) { return hint->block == REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; } #ifdef DISPLACE_NEW_PACKING_LOCALITIES static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint) { struct in_core_key *key = &hint->key; hint->th->displace_new_blocks = 0; hint->search_start = hint->beg + keyed_hash((char *)(&key->k_objectid), 4) % (hint->end - hint->beg); } #endif static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint) { b_blocknr_t border; u32 hash_in; if (hint->formatted_node || hint->inode == NULL) { return 0; } hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); border = hint->beg + (u32) keyed_hash(((char *)(&hash_in)), 4) % (hint->end - hint->beg - 1); if (border > hint->search_start) hint->search_start = border; return 1; } static inline int old_way(reiserfs_blocknr_hint_t * hint) { b_blocknr_t border; if (hint->formatted_node || hint->inode == NULL) { return 0; } border = hint->beg + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - hint->beg); if (border > hint->search_start) hint->search_start = border; return 1; } static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint) { struct in_core_key *key = &hint->key; b_blocknr_t slice_start; slice_start = (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100); if (slice_start > hint->search_start || slice_start + (hint->end / 100) <= hint->search_start) { hint->search_start = slice_start; } } static void determine_search_start(reiserfs_blocknr_hint_t * hint, int amount_needed) { struct super_block *s = hint->th->t_super; int unfm_hint; hint->beg = 0; hint->end = SB_BLOCK_COUNT(s) - 1; /* This is former border algorithm. Now with tunable border offset */ if (concentrating_formatted_nodes(s)) set_border_in_hint(s, hint); #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* * whenever we create a new directory, we displace it. 
At first * we will hash for location, later we might look for a moderately * empty place for it */ if (displacing_new_packing_localities(s) && hint->th->displace_new_blocks) { displace_new_packing_locality(hint); /* * we do not continue determine_search_start, * if new packing locality is being displaced */ return; } #endif /* * all persons should feel encouraged to add more special cases * here and test them */ if (displacing_large_files(s) && !hint->formatted_node && this_blocknr_allocation_would_make_it_a_large_file(hint)) { displace_large_file(hint); return; } /* * if none of our special cases is relevant, use the left * neighbor in the tree order of the new node we are allocating for */ if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { hash_formatted_node(hint); return; } unfm_hint = get_left_neighbor(hint); /* * Mimic old block allocator behaviour, that is if VFS allowed for * preallocation, new blocks are displaced based on directory ID. * Also, if suggested search_start is less than last preallocated * block, we start searching from it, assuming that HDD dataflow * is faster in forward direction */ if (TEST_OPTION(old_way, s)) { if (!hint->formatted_node) { if (!reiserfs_hashed_relocation(s)) old_way(hint); else if (!reiserfs_no_unhashed_relocation(s)) old_hashed_relocation(hint); if (hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block) hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block; } return; } /* This is an approach proposed by Hans */ if (TEST_OPTION(hundredth_slices, s) && !(displacing_large_files(s) && !hint->formatted_node)) { hundredth_slices(hint); return; } /* old_hashed_relocation only works on unformatted */ if (!unfm_hint && !hint->formatted_node && TEST_OPTION(old_hashed_relocation, s)) { old_hashed_relocation(hint); } /* new_hashed_relocation works with both formatted/unformatted nodes */ if ((!unfm_hint || hint->formatted_node) && TEST_OPTION(new_hashed_relocation, s)) { new_hashed_relocation(hint); } /* dirid grouping works only on unformatted nodes */ if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { dirid_groups(hint); } #ifdef DISPLACE_NEW_PACKING_LOCALITIES if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) { dirid_groups(hint); } #endif /* oid grouping works only on unformatted nodes */ if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) { oid_groups(hint); } return; } static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) { /* make minimum size a mount option and benchmark both ways */ /* we preallocate blocks only for regular files, specific size */ /* benchmark preallocating always and see what happens */ hint->prealloc_size = 0; if (!hint->formatted_node && hint->preallocate) { if (S_ISREG(hint->inode->i_mode) && !IS_PRIVATE(hint->inode) && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options. preallocmin * hint->inode->i_sb->s_blocksize) hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options. 
preallocsize - 1; } return CARRY_ON; } static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, b_blocknr_t start, b_blocknr_t finish, int min, int amount_needed, int prealloc_size) { int rest = amount_needed; int nr_allocated; while (rest > 0 && start <= finish) { nr_allocated = scan_bitmap(hint->th, &start, finish, min, rest + prealloc_size, !hint->formatted_node, hint->block); if (nr_allocated == 0) /* no new blocks allocated, return */ break; /* fill free_blocknrs array first */ while (rest > 0 && nr_allocated > 0) { *new_blocknrs++ = start++; rest--; nr_allocated--; } /* do we have something to fill prealloc. array also ? */ if (nr_allocated > 0) { /* * it means prealloc_size was greater that 0 and * we do preallocation */ list_add(&REISERFS_I(hint->inode)->i_prealloc_list, &SB_JOURNAL(hint->th->t_super)-> j_prealloc_list); REISERFS_I(hint->inode)->i_prealloc_block = start; REISERFS_I(hint->inode)->i_prealloc_count = nr_allocated; break; } } return (amount_needed - rest); } static inline int blocknrs_and_prealloc_arrays_from_search_start (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed) { struct super_block *s = hint->th->t_super; b_blocknr_t start = hint->search_start; b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; int passno = 0; int nr_allocated = 0; int depth; determine_prealloc_size(hint); if (!hint->formatted_node) { int quota_ret; #ifdef REISERQUOTA_DEBUG reiserfs_debug(s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid); #endif depth = reiserfs_write_unlock_nested(s); quota_ret = dquot_alloc_block_nodirty(hint->inode, amount_needed); if (quota_ret) { /* Quota exceeded? */ reiserfs_write_lock_nested(s, depth); return QUOTA_EXCEEDED; } if (hint->preallocate && hint->prealloc_size) { #ifdef REISERQUOTA_DEBUG reiserfs_debug(s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); #endif quota_ret = dquot_prealloc_block_nodirty(hint->inode, hint->prealloc_size); if (quota_ret) hint->preallocate = hint->prealloc_size = 0; } /* for unformatted nodes, force large allocations */ reiserfs_write_lock_nested(s, depth); } do { switch (passno++) { case 0: /* Search from hint->search_start to end of disk */ start = hint->search_start; finish = SB_BLOCK_COUNT(s) - 1; break; case 1: /* Search from hint->beg to hint->search_start */ start = hint->beg; finish = hint->search_start; break; case 2: /* Last chance: Search from 0 to hint->beg */ start = 0; finish = hint->beg; break; default: /* We've tried searching everywhere, not enough space */ /* Free the blocks */ if (!hint->formatted_node) { #ifdef REISERQUOTA_DEBUG reiserfs_debug(s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid); #endif /* Free not allocated blocks */ depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); reiserfs_write_lock_nested(s, depth); } while (nr_allocated--) reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node); return NO_DISK_SPACE; } } while ((nr_allocated += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish, 1, amount_needed - nr_allocated, hint-> prealloc_size)) < amount_needed); if (!hint->formatted_node && amount_needed + hint->prealloc_size > nr_allocated + 
REISERFS_I(hint->inode)->i_prealloc_count) { /* Some of preallocation blocks were not allocated */ #ifdef REISERQUOTA_DEBUG reiserfs_debug(s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); #endif depth = reiserfs_write_unlock_nested(s); dquot_free_block_nodirty(hint->inode, amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)-> i_prealloc_count); reiserfs_write_lock_nested(s, depth); } return CARRY_ON; } /* grab new blocknrs from preallocated list */ /* return amount still needed after using them */ static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed) { struct inode *inode = hint->inode; if (REISERFS_I(inode)->i_prealloc_count > 0) { while (amount_needed) { *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++; REISERFS_I(inode)->i_prealloc_count--; amount_needed--; if (REISERFS_I(inode)->i_prealloc_count <= 0) { list_del(&REISERFS_I(inode)->i_prealloc_list); break; } } } /* return amount still needed after using preallocated blocks */ return amount_needed; } int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed, /* Amount of blocks we have already reserved */ int reserved_by_us) { int initial_amount_needed = amount_needed; int ret; struct super_block *s = hint->th->t_super; /* Check if there is enough space, taking into account reserved space */ if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < amount_needed - reserved_by_us) return NO_DISK_SPACE; /* should this be if !hint->inode && hint->preallocate? */ /* do you mean hint->formatted_node can be removed ? - Zam */ /* * hint->formatted_node cannot be removed because we try to access * inode information here, and there is often no inode associated with * metadata allocations - green */ if (!hint->formatted_node && hint->preallocate) { amount_needed = use_preallocated_list_if_available (hint, new_blocknrs, amount_needed); /* * We have all the block numbers we need from the * prealloc list */ if (amount_needed == 0) return CARRY_ON; new_blocknrs += (initial_amount_needed - amount_needed); } /* find search start and save it in hint structure */ determine_search_start(hint, amount_needed); if (hint->search_start >= SB_BLOCK_COUNT(s)) hint->search_start = SB_BLOCK_COUNT(s) - 1; /* allocation itself; fill new_blocknrs and preallocation arrays */ ret = blocknrs_and_prealloc_arrays_from_search_start (hint, new_blocknrs, amount_needed); /* * We used prealloc. list to fill (partially) new_blocknrs array. * If final allocation fails we need to return blocks back to * prealloc. list or just free them. 
-- Zam (I chose second * variant) */ if (ret != CARRY_ON) { while (amount_needed++ < initial_amount_needed) { reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1); } } return ret; } void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info) { unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size); /* The first bit must ALWAYS be 1 */ if (!reiserfs_test_le_bit(0, (unsigned long *)bh->b_data)) reiserfs_error(sb, "reiserfs-2025", "bitmap block %lu is " "corrupted: first bit must be 1", bh->b_blocknr); info->free_count = 0; while (--cur >= (unsigned long *)bh->b_data) { /* 0 and ~0 are special, we can optimize for them */ if (*cur == 0) info->free_count += BITS_PER_LONG; else if (*cur != ~0L) /* A mix, investigate */ info->free_count += BITS_PER_LONG - hweight_long(*cur); } } struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, unsigned int bitmap) { b_blocknr_t block = (sb->s_blocksize << 3) * bitmap; struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap; struct buffer_head *bh; /* * Way old format filesystems had the bitmaps packed up front. * I doubt there are any of these left, but just in case... */ if (unlikely(test_bit(REISERFS_OLD_FORMAT, &REISERFS_SB(sb)->s_properties))) block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap; else if (bitmap == 0) block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; bh = sb_bread(sb, block); if (bh == NULL) reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " "reading failed", __func__, block); else { if (buffer_locked(bh)) { int depth; PROC_INFO_INC(sb, scan_bitmap.wait); depth = reiserfs_write_unlock_nested(sb); __wait_on_buffer(bh); reiserfs_write_lock_nested(sb, depth); } BUG_ON(!buffer_uptodate(bh)); BUG_ON(atomic_read(&bh->b_count) == 0); if (info->free_count == UINT_MAX) reiserfs_cache_bitmap_metadata(sb, bh, info); } return bh; } int reiserfs_init_bitmap_cache(struct super_block *sb) { struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); bitmap = vmalloc(array_size(bmap_nr, sizeof(*bitmap))); if (bitmap == NULL) return -ENOMEM; memset(bitmap, 0xff, sizeof(*bitmap) * bmap_nr); SB_AP_BITMAP(sb) = bitmap; return 0; } void reiserfs_free_bitmap_cache(struct super_block *sb) { if (SB_AP_BITMAP(sb)) { vfree(SB_AP_BITMAP(sb)); SB_AP_BITMAP(sb) = NULL; } }
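/*
 * Illustrative standalone sketch (not part of bitmap.c above): the block
 * number to (bitmap block, bit offset) mapping performed by
 * get_bit_address(). A bitmap block of 1 << blocksize_bits bytes holds
 * blocksize * 8 bits, so the bitmap number is block / (blocksize * 8) and
 * the offset is the remainder. The 4 KiB block size used below is an
 * assumption for the example only.
 */
#include <stdio.h>

static void get_bit_address_example(unsigned long block,
				    unsigned int blocksize_bits,
				    unsigned int *bmap_nr,
				    unsigned int *offset)
{
	unsigned long blocksize = 1UL << blocksize_bits;

	/* Same arithmetic as get_bit_address() above. */
	*bmap_nr = block >> (blocksize_bits + 3);
	*offset = block & ((blocksize << 3) - 1);
}

int main(void)
{
	unsigned int bmap, off;

	/* 4 KiB blocks: each bitmap block covers 4096 * 8 = 32768 blocks. */
	get_bit_address_example(100000, 12, &bmap, &off);
	printf("block 100000 -> bitmap %u, bit %u\n", bmap, off); /* 3, 1696 */
	return 0;
}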
// SPDX-License-Identifier: MIT #include <uapi/linux/sched/types.h> #include <drm/drm_print.h> #include <drm/drm_vblank.h> #include <drm/drm_vblank_work.h> #include <drm/drm_crtc.h> #include "drm_internal.h" /** * DOC: vblank works * * Many DRM drivers need to program hardware in a time-sensitive manner, many * times with a deadline of starting and finishing within a certain region of * the scanout. Most of the time the safest way to accomplish this is to * simply do said time-sensitive programming in the driver's IRQ handler, * which allows drivers to avoid being preempted during these critical * regions. Or even better, the hardware may even handle applying such * time-critical programming independently of the CPU. * * While there's a decent amount of hardware that's designed so that the CPU * doesn't need to be concerned with extremely time-sensitive programming, * there's a few situations where it can't be helped. Some unforgiving * hardware may require that certain time-sensitive programming be handled * completely by the CPU, and said programming may even take too long to * handle in an IRQ handler. Another such situation would be where the driver * needs to perform a task that needs to complete within a specific scanout * period, but might possibly block and thus cannot be handled in an IRQ * context. Both of these situations can't be solved perfectly in Linux since * we're not a realtime kernel, and thus the scheduler may cause us to miss * our deadline if it decides to preempt us. But for some drivers, it's good * enough if we can lower our chance of being preempted to an absolute * minimum. * * This is where &drm_vblank_work comes in. &drm_vblank_work provides a simple * generic delayed work implementation which delays work execution until a * particular vblank has passed, and then executes the work at realtime * priority. This provides the best possible chance at performing * time-sensitive hardware programming on time, even when the system is under * heavy load. &drm_vblank_work also supports rescheduling, so that self * re-arming work items can be easily implemented.
*/ void drm_handle_vblank_works(struct drm_vblank_crtc *vblank) { struct drm_vblank_work *work, *next; u64 count = atomic64_read(&vblank->count); bool wake = false; assert_spin_locked(&vblank->dev->event_lock); list_for_each_entry_safe(work, next, &vblank->pending_work, node) { if (!drm_vblank_passed(count, work->count)) continue; list_del_init(&work->node); drm_vblank_put(vblank->dev, vblank->pipe); kthread_queue_work(vblank->worker, &work->base); wake = true; } if (wake) wake_up_all(&vblank->work_wait_queue); } /* Handle cancelling any pending vblank work items and drop respective vblank * references in response to vblank interrupts being disabled. */ void drm_vblank_cancel_pending_works(struct drm_vblank_crtc *vblank) { struct drm_vblank_work *work, *next; assert_spin_locked(&vblank->dev->event_lock); drm_WARN_ONCE(vblank->dev, !list_empty(&vblank->pending_work), "Cancelling pending vblank works!\n"); list_for_each_entry_safe(work, next, &vblank->pending_work, node) { list_del_init(&work->node); drm_vblank_put(vblank->dev, vblank->pipe); } wake_up_all(&vblank->work_wait_queue); } /** * drm_vblank_work_schedule - schedule a vblank work * @work: vblank work to schedule * @count: target vblank count * @nextonmiss: defer until the next vblank if target vblank was missed * * Schedule @work for execution once the crtc vblank count reaches @count. * * If the crtc vblank count has already reached @count and @nextonmiss is * %false the work starts to execute immediately. * * If the crtc vblank count has already reached @count and @nextonmiss is * %true the work is deferred until the next vblank (as if @count has been * specified as crtc vblank count + 1). * * If @work is already scheduled, this function will reschedule said work * using the new @count. This can be used for self-rearming work items. * * Returns: * %1 if @work was successfully (re)scheduled, %0 if it was either already * scheduled or cancelled, or a negative error code on failure. */ int drm_vblank_work_schedule(struct drm_vblank_work *work, u64 count, bool nextonmiss) { struct drm_vblank_crtc *vblank = work->vblank; struct drm_device *dev = vblank->dev; u64 cur_vbl; unsigned long irqflags; bool passed, inmodeset, rescheduling = false, wake = false; int ret = 0; spin_lock_irqsave(&dev->event_lock, irqflags); if (work->cancelling) goto out; spin_lock(&dev->vbl_lock); inmodeset = vblank->inmodeset; spin_unlock(&dev->vbl_lock); if (inmodeset) goto out; if (list_empty(&work->node)) { ret = drm_vblank_get(dev, vblank->pipe); if (ret < 0) goto out; } else if (work->count == count) { /* Already scheduled w/ same vbl count */ goto out; } else { rescheduling = true; } work->count = count; cur_vbl = drm_vblank_count(dev, vblank->pipe); passed = drm_vblank_passed(cur_vbl, count); if (passed) drm_dbg_core(dev, "crtc %d vblank %llu already passed (current %llu)\n", vblank->pipe, count, cur_vbl); if (!nextonmiss && passed) { drm_vblank_put(dev, vblank->pipe); ret = kthread_queue_work(vblank->worker, &work->base); if (rescheduling) { list_del_init(&work->node); wake = true; } } else { if (!rescheduling) list_add_tail(&work->node, &vblank->pending_work); ret = true; } out: spin_unlock_irqrestore(&dev->event_lock, irqflags); if (wake) wake_up_all(&vblank->work_wait_queue); return ret; } EXPORT_SYMBOL(drm_vblank_work_schedule); /** * drm_vblank_work_cancel_sync - cancel a vblank work and wait for it to * finish executing * @work: vblank work to cancel * * Cancel an already scheduled vblank work and wait for its * execution to finish. 
* * On return, @work is guaranteed to no longer be scheduled or running, even * if it's self-arming. * * Returns: * %True if the work was cancelled before it started to execute, %false * otherwise. */ bool drm_vblank_work_cancel_sync(struct drm_vblank_work *work) { struct drm_vblank_crtc *vblank = work->vblank; struct drm_device *dev = vblank->dev; bool ret = false; spin_lock_irq(&dev->event_lock); if (!list_empty(&work->node)) { list_del_init(&work->node); drm_vblank_put(vblank->dev, vblank->pipe); ret = true; } work->cancelling++; spin_unlock_irq(&dev->event_lock); wake_up_all(&vblank->work_wait_queue); if (kthread_cancel_work_sync(&work->base)) ret = true; spin_lock_irq(&dev->event_lock); work->cancelling--; spin_unlock_irq(&dev->event_lock); return ret; } EXPORT_SYMBOL(drm_vblank_work_cancel_sync); /** * drm_vblank_work_flush - wait for a scheduled vblank work to finish * executing * @work: vblank work to flush * * Wait until @work has finished executing once. */ void drm_vblank_work_flush(struct drm_vblank_work *work) { struct drm_vblank_crtc *vblank = work->vblank; struct drm_device *dev = vblank->dev; spin_lock_irq(&dev->event_lock); wait_event_lock_irq(vblank->work_wait_queue, list_empty(&work->node), dev->event_lock); spin_unlock_irq(&dev->event_lock); kthread_flush_work(&work->base); } EXPORT_SYMBOL(drm_vblank_work_flush); /** * drm_vblank_work_init - initialize a vblank work item * @work: vblank work item * @crtc: CRTC whose vblank will trigger the work execution * @func: work function to be executed * * Initialize a vblank work item for a specific crtc. */ void drm_vblank_work_init(struct drm_vblank_work *work, struct drm_crtc *crtc, void (*func)(struct kthread_work *work)) { kthread_init_work(&work->base, func); INIT_LIST_HEAD(&work->node); work->vblank = &crtc->dev->vblank[drm_crtc_index(crtc)]; } EXPORT_SYMBOL(drm_vblank_work_init); int drm_vblank_worker_init(struct drm_vblank_crtc *vblank) { struct kthread_worker *worker; INIT_LIST_HEAD(&vblank->pending_work); init_waitqueue_head(&vblank->work_wait_queue); worker = kthread_create_worker(0, "card%d-crtc%d", vblank->dev->primary->index, vblank->pipe); if (IS_ERR(worker)) return PTR_ERR(worker); vblank->worker = worker; sched_set_fifo(worker->task); return 0; }
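/*
 * Illustrative driver-side sketch (not part of drm_vblank_work.c above):
 * initializing a vblank work item and scheduling it as a self-rearming job
 * that runs once per vblank, as permitted by drm_vblank_work_schedule().
 * The my_crtc_state structure, the crtc pointer and the use of
 * drm_crtc_vblank_count() are assumptions for the example; the
 * drm_vblank_work_* calls are the ones documented above.
 */
#include <linux/container_of.h>
#include <linux/kthread.h>
#include <drm/drm_crtc.h>
#include <drm/drm_vblank.h>
#include <drm/drm_vblank_work.h>

struct my_crtc_state {
	struct drm_crtc *crtc;
	struct drm_vblank_work flip_work;
};

static void my_flip_work_fn(struct kthread_work *base)
{
	struct drm_vblank_work *work =
		container_of(base, struct drm_vblank_work, base);
	struct my_crtc_state *st =
		container_of(work, struct my_crtc_state, flip_work);

	/* ... time-sensitive programming of st->crtc would go here ... */

	/* Re-arm for the next vblank (self-rearming work is supported). */
	drm_vblank_work_schedule(work,
				 drm_crtc_vblank_count(st->crtc) + 1, true);
}

static void my_crtc_start_flip_work(struct my_crtc_state *st)
{
	drm_vblank_work_init(&st->flip_work, st->crtc, my_flip_work_fn);
	drm_vblank_work_schedule(&st->flip_work,
				 drm_crtc_vblank_count(st->crtc) + 1, true);
}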
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMER_H #define _LINUX_TIMER_H #include <linux/list.h> #include <linux/ktime.h> #include <linux/stddef.h> #include <linux/debugobjects.h> #include <linux/stringify.h> #include <linux/timer_types.h> #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting the lockdep_map key * (second argument) here is required, otherwise it could be initialised to * the copy of the lockdep_map later! We use the pointer to and the string * "<file>:<line>" as the key resp. the name of the lockdep_map. */ #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(_kn, &_kn), #else #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn) #endif /** * @TIMER_DEFERRABLE: A deferrable timer will work normally when the * system is busy, but will not cause a CPU to come out of idle just * to service it; instead, the timer will be serviced when the CPU * eventually wakes up with a subsequent non-deferrable timer. * * @TIMER_IRQSAFE: An irqsafe timer is executed with IRQ disabled and * it's safe to wait for the completion of the running instance from * IRQ handlers, for example, by calling del_timer_sync(). * * Note: The irq disabled callback execution is a special case for * workqueue locking issues. It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! * * @TIMER_PINNED: A pinned timer will not be affected by any timer * placement heuristics (like, NOHZ) and will always expire on the CPU * on which the timer was enqueued. * * Note: Because enqueuing of timers can migrate the timer from one * CPU to another, pinned timers are not guaranteed to stay on the * initially selected CPU. They move to the CPU on which the enqueue * function is invoked via mod_timer() or add_timer(). If the timer * should be placed on a particular CPU, then add_timer_on() has to be * used. */ #define TIMER_CPUMASK 0x0003FFFF #define TIMER_MIGRATING 0x00040000 #define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) #define TIMER_DEFERRABLE 0x00080000 #define TIMER_PINNED 0x00100000 #define TIMER_IRQSAFE 0x00200000 #define TIMER_INIT_FLAGS (TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) #define TIMER_ARRAYSHIFT 22 #define TIMER_ARRAYMASK 0xFFC00000 #define TIMER_TRACE_FLAGMASK (TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE) #define __TIMER_INITIALIZER(_function, _flags) { \ .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .flags = (_flags), \ __TIMER_LOCKDEP_MAP_INITIALIZER(FILE_LINE) \ } #define DEFINE_TIMER(_name, _function) \ struct timer_list _name = \ __TIMER_INITIALIZER(_function, 0) /* * LOCKDEP and DEBUG timer interfaces.
*/ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); #else static inline void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { init_timer_key(timer, func, flags, name, key); } #endif #ifdef CONFIG_LOCKDEP #define __init_timer(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\ } while (0) #define __init_timer_on_stack(_timer, _fn, _flags) \ do { \ static struct lock_class_key __key; \ init_timer_on_stack_key((_timer), (_fn), (_flags), \ #_timer, &__key); \ } while (0) #else #define __init_timer(_timer, _fn, _flags) \ init_timer_key((_timer), (_fn), (_flags), NULL, NULL) #define __init_timer_on_stack(_timer, _fn, _flags) \ init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL) #endif /** * timer_setup - prepare a timer for first use * @timer: the timer in question * @callback: the function to call when timer expires * @flags: any TIMER_* flags * * Regular timer initialization should use either DEFINE_TIMER() above, * or timer_setup(). For timers on the stack, timer_setup_on_stack() must * be used and must be balanced with a call to destroy_timer_on_stack(). */ #define timer_setup(timer, callback, flags) \ __init_timer((timer), (callback), (flags)) #define timer_setup_on_stack(timer, callback, flags) \ __init_timer_on_stack((timer), (callback), (flags)) #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void destroy_timer_on_stack(struct timer_list *timer); #else static inline void destroy_timer_on_stack(struct timer_list *timer) { } #endif #define from_timer(var, callback_timer, timer_fieldname) \ container_of(callback_timer, typeof(*var), timer_fieldname) /** * timer_pending - is a timer pending? * @timer: the timer in question * * timer_pending will tell whether a given timer is currently pending, * or not. Callers must ensure serialization wrt. other operations done * to this timer, eg. interrupt contexts, or other CPUs on SMP. * * return value: 1 if the timer is pending, 0 if not. */ static inline int timer_pending(const struct timer_list * timer) { return !hlist_unhashed_lockless(&timer->entry); } extern void add_timer_on(struct timer_list *timer, int cpu); extern int mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); extern int timer_reduce(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer * in the timer wheel: */ #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) extern void add_timer(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int timer_delete_sync(struct timer_list *timer); extern int timer_delete(struct timer_list *timer); extern int timer_shutdown_sync(struct timer_list *timer); extern int timer_shutdown(struct timer_list *timer); /** * del_timer_sync - Delete a pending timer and wait for a running callback * @timer: The timer to be deleted * * See timer_delete_sync() for detailed explanation. * * Do not use in new code. Use timer_delete_sync() instead. 
*/ static inline int del_timer_sync(struct timer_list *timer) { return timer_delete_sync(timer); } /** * del_timer - Delete a pending timer * @timer: The timer to be deleted * * See timer_delete() for detailed explanation. * * Do not use in new code. Use timer_delete() instead. */ static inline int del_timer(struct timer_list *timer) { return timer_delete(timer); } extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); unsigned long round_jiffies_relative(unsigned long j); unsigned long __round_jiffies_up(unsigned long j, int cpu); unsigned long __round_jiffies_up_relative(unsigned long j, int cpu); unsigned long round_jiffies_up(unsigned long j); unsigned long round_jiffies_up_relative(unsigned long j); #ifdef CONFIG_HOTPLUG_CPU int timers_prepare_cpu(unsigned int cpu); int timers_dead_cpu(unsigned int cpu); #else #define timers_prepare_cpu NULL #define timers_dead_cpu NULL #endif #endif
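/*
 * Illustrative usage sketch (not part of timer.h above): embedding a
 * timer_list in a private structure, arming it with mod_timer(), and
 * recovering the container with from_timer() in the callback. The
 * my_device structure and the one-second period are assumptions for the
 * example; the timer API calls are the ones declared above.
 */
#include <linux/jiffies.h>
#include <linux/timer.h>

struct my_device {
	struct timer_list poll_timer;
	unsigned long polls;
};

static void my_poll_timer_fn(struct timer_list *t)
{
	struct my_device *dev = from_timer(dev, t, poll_timer);

	dev->polls++;
	/* Re-arm: fire again roughly one second from now. */
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void my_device_start_polling(struct my_device *dev)
{
	timer_setup(&dev->poll_timer, my_poll_timer_fn, 0);
	mod_timer(&dev->poll_timer, jiffies + HZ);
}

static void my_device_stop_polling(struct my_device *dev)
{
	/* Wait for a running callback to finish before tearing down. */
	timer_delete_sync(&dev->poll_timer);
}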
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 */

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/usb.h>
#include <linux/usb/audio.h>
#include <linux/usb/midi.h>
#include <linux/bits.h>

#include <sound/control.h>
#include <sound/core.h>
#include <sound/info.h>
#include <sound/pcm.h>

#include "usbaudio.h"
#include "card.h"
#include "mixer.h"
#include "mixer_quirks.h"
#include "midi.h" #include "midi2.h" #include "quirks.h" #include "helper.h" #include "endpoint.h" #include "pcm.h" #include "clock.h" #include "stream.h" /* * handle the quirks for the contained interfaces */ static int create_composite_quirk(struct snd_usb_audio *chip, struct usb_interface *iface, struct usb_driver *driver, const struct snd_usb_audio_quirk *quirk_comp) { int probed_ifnum = get_iface_desc(iface->altsetting)->bInterfaceNumber; const struct snd_usb_audio_quirk *quirk; int err; for (quirk = quirk_comp->data; quirk->ifnum >= 0; ++quirk) { iface = usb_ifnum_to_if(chip->dev, quirk->ifnum); if (!iface) continue; if (quirk->ifnum != probed_ifnum && usb_interface_claimed(iface)) continue; err = snd_usb_create_quirk(chip, iface, driver, quirk); if (err < 0) return err; } for (quirk = quirk_comp->data; quirk->ifnum >= 0; ++quirk) { iface = usb_ifnum_to_if(chip->dev, quirk->ifnum); if (!iface) continue; if (quirk->ifnum != probed_ifnum && !usb_interface_claimed(iface)) { err = usb_driver_claim_interface(driver, iface, USB_AUDIO_IFACE_UNUSED);