92 19 20 90 90 93 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_DEFS_H #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> #include <linux/mem_encrypt.h> #include <asm/page_types.h> #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ #define _PAGE_BIT_USER 2 /* userspace addressable */ #define _PAGE_BIT_PWT 3 /* page write through */ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SOFTW4 57 /* available for programmer */ #define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ #define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 #ifdef CONFIG_X86_64 #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */ #else /* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ #define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit */ #endif /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3) #else #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ _PAGE_PKEY_BIT2 | \ _PAGE_PKEY_BIT3) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) #else #define _PAGE_KNL_ERRATUM_MASK 0 #endif #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #endif /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 1-4 are *not* involved * into swap entry computation, but bit 7 is used for thp migration, * so we borrow bit 1 for soft dirty tracking. * * Please note that this bit must be treated as swap dirty page * mark if and only if the PTE/PMD has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_RW #else #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) #define _PAGE_SWP_UFFD_WP _PAGE_USER #else #define _PAGE_UFFD_WP (_AT(pteval_t, 0)) #define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) #define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4) #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_DEVMAP (_AT(pteval_t, 0)) #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif /* * The hardware requires shadow stack to be Write=0,Dirty=1. However, * there are valid cases where the kernel might create read-only PTEs that * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have * (Write=0,SavedDirty=1,Dirty=0) set. */ #define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) #define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) /* * Set of bits not changed in pte_modify. The pte's * protection key is treated like _PAGE_RW, for * instance, and is *not* included in this mask since * pte_modify() does modify it. */ #define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | \ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP) #define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) #define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) /* * The cache modes defined here are used to translate between pure SW usage * and the HW defined cache mode bits and/or PAT entries. * * The resulting bits for PWT, PCD and PAT should be chosen in a way * to have the WB mode at index 0 (all bits clear). This is the default * right now and likely would break too much if changed. */ #ifndef __ASSEMBLY__ enum page_cache_mode { _PAGE_CACHE_MODE_WB = 0, _PAGE_CACHE_MODE_WC = 1, _PAGE_CACHE_MODE_UC_MINUS = 2, _PAGE_CACHE_MODE_UC = 3, _PAGE_CACHE_MODE_WT = 4, _PAGE_CACHE_MODE_WP = 5, _PAGE_CACHE_MODE_NUM = 8 }; #endif #define _PAGE_CC (_AT(pteval_t, cc_mask)) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) #define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define __PP _PAGE_PRESENT #define __RW _PAGE_RW #define _USR _PAGE_USER #define ___A _PAGE_ACCESSED #define ___D _PAGE_DIRTY #define ___G _PAGE_GLOBAL #define __NX _PAGE_NX #define _ENC _PAGE_ENC #define __WP _PAGE_CACHE_WP #define __NC _PAGE_NOCACHE #define _PSE _PAGE_PSE #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) #define __pg(x) __pgprot(x) #define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G) #define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0) #define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY_NOEXEC __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_COPY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) #define PAGE_COPY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0) #define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0) #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) /* * Page tables needs to have Write=1 in order for any lower PTEs to be * writable. This includes shadow stack memory (Write=0, Dirty=1) */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) #define __PAGE_KERNEL_IO __PAGE_KERNEL #define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE #ifndef __ASSEMBLY__ #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC) #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL | 0) #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP | 0) #define __pgprot_mask(x) __pgprot((x) & __default_kernel_pte_mask) #define PAGE_KERNEL __pgprot_mask(__PAGE_KERNEL | _ENC) #define PAGE_KERNEL_NOENC __pgprot_mask(__PAGE_KERNEL | 0) #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) #define PAGE_KERNEL_ROX __pgprot_mask(__PAGE_KERNEL_ROX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) #define PAGE_KERNEL_VVAR __pgprot_mask(__PAGE_KERNEL_VVAR | _ENC) #define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE) #endif /* __ASSEMBLY__ */ /* * early identity mapping pte attrib macros. */ #ifdef CONFIG_X86_64 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC #else #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif #ifdef CONFIG_X86_32 # include <asm/pgtable_32_types.h> #else # include <asm/pgtable_64_types.h> #endif #ifndef __ASSEMBLY__ #include <linux/types.h> /* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */ #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) /* * Extracts the flags from a (pte|pmd|pud|pgd)val_t * This includes the protection key value. */ #define PTE_FLAGS_MASK (~PTE_PFN_MASK) typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; static inline pgprot_t pgprot_nx(pgprot_t prot) { return __pgprot(pgprot_val(prot) | _PAGE_NX); } #define pgprot_nx pgprot_nx #ifdef CONFIG_X86_PAE /* * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't * use it here. */ #define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK) #define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK) /* * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries. * All other bits are Reserved MBZ */ #define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \ _PAGE_PWT | _PAGE_PCD | \ _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3) #else /* No need to mask any bits for !PAE */ #define PGD_ALLOWED_BITS (~0ULL) #endif static inline pgd_t native_make_pgd(pgdval_t val) { return (pgd_t) { val & PGD_ALLOWED_BITS }; } static inline pgdval_t native_pgd_val(pgd_t pgd) { return pgd.pgd & PGD_ALLOWED_BITS; } static inline pgdval_t pgd_flags(pgd_t pgd) { return native_pgd_val(pgd) & PTE_FLAGS_MASK; } #if CONFIG_PGTABLE_LEVELS > 4 typedef struct { p4dval_t p4d; } p4d_t; static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { val }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return p4d.p4d; } #else #include <asm-generic/pgtable-nop4d.h> static inline p4d_t native_make_p4d(pudval_t val) { return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; } static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) { return (pud_t) { val }; } static inline pudval_t native_pud_val(pud_t pud) { return pud.pud; } #else #include <asm-generic/pgtable-nopud.h> static inline pud_t native_make_pud(pudval_t val) { return (pud_t) { .p4d.pgd = native_make_pgd(val) }; } static inline pudval_t native_pud_val(pud_t pud) { return native_pgd_val(pud.p4d.pgd); } #endif #if CONFIG_PGTABLE_LEVELS > 2 static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pmd = val }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return pmd.pmd; } #else #include <asm-generic/pgtable-nopmd.h> static inline pmd_t native_make_pmd(pmdval_t val) { return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) }; } static inline pmdval_t native_pmd_val(pmd_t pmd) { return native_pgd_val(pmd.pud.p4d.pgd); } #endif static inline p4dval_t p4d_pfn_mask(p4d_t p4d) { /* No 512 GiB huge pages yet */ return PTE_PFN_MASK; } static inline p4dval_t p4d_flags_mask(p4d_t p4d) { return ~p4d_pfn_mask(p4d); } static inline p4dval_t p4d_flags(p4d_t p4d) { return native_p4d_val(p4d) & p4d_flags_mask(p4d); } static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) return PHYSICAL_PUD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pudval_t pud_flags_mask(pud_t pud) { return ~pud_pfn_mask(pud); } static inline pudval_t pud_flags(pud_t pud) { return native_pud_val(pud) & pud_flags_mask(pud); } static inline pmdval_t pmd_pfn_mask(pmd_t pmd) { if (native_pmd_val(pmd) & _PAGE_PSE) return PHYSICAL_PMD_PAGE_MASK; else return PTE_PFN_MASK; } static inline pmdval_t pmd_flags_mask(pmd_t pmd) { return ~pmd_pfn_mask(pmd); } static inline pmdval_t pmd_flags(pmd_t pmd) { return native_pmd_val(pmd) & pmd_flags_mask(pmd); } static inline pte_t native_make_pte(pteval_t val) { return (pte_t) { .pte = val }; } static inline pteval_t native_pte_val(pte_t pte) { return pte.pte; } static inline pteval_t pte_flags(pte_t pte) { return native_pte_val(pte) & PTE_FLAGS_MASK; } #define __pte2cm_idx(cb) \ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ (((cb) >> _PAGE_BIT_PWT) & 1)) #define __cm_idx2pte(i) \ ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ (((i) & 1) << _PAGE_BIT_PWT)) unsigned long cachemode2protval(enum page_cache_mode pcm); static inline pgprotval_t protval_4k_2_large(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { return __pgprot(protval_4k_2_large(pgprot_val(pgprot))); } static inline pgprotval_t protval_large_2_4k(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { return __pgprot(protval_large_2_4k(pgprot_val(pgprot))); } typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; extern void set_nx(void); extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); #define pgprot_writethrough pgprot_writethrough extern pgprot_t pgprot_writethrough(pgprot_t prot); /* Indicate that x86 has its own track and untrack pfn vma functions */ #define __HAVE_PFNMAP_TRACKING #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); #ifdef CONFIG_X86_32 extern void native_pagetable_init(void); #else #define native_pagetable_init paging_init #endif enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, PG_LEVEL_NUM }; #ifdef CONFIG_PROC_FS extern void update_page_count(int level, unsigned long pages); #else static inline void update_page_count(int level, unsigned long pages) { } #endif /* * Helper function that returns the kernel pagetable entry controlling * the virtual address 'address'. NULL means no pagetable entry present. * NOTE: the return type is pte_t but if the pmd is PSE then we return it * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */
88 88 89 44 42 86 88 6 6 6 6 6 27 25 25 24 24 18 5 6 23 3 3 24 27 5 5 5 5 2 3 74 73 72 70 74 70 5 71 66 5 5 49 49 70 5 46 26 27 21 9 11 27 26 27 75 73 28 48 16 74 76 75 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 3 3 3 3 2 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } return offset; } EXPORT_SYMBOL(vfs_setpos); /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { switch (whence) { case SEEK_END: offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (offset == 0) return file->f_pos; /* * f_lock protects against read/modify/write race with other * SEEK_CURs. Note that parallel writes and reads behave * like SEEK_SET. */ spin_lock(&file->f_lock); offset = vfs_setpos(file, file->f_pos + offset, maxsize); spin_unlock(&file->f_lock); return offset; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implemenation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(f.file, offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } fdput_pos(f); return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; struct fd f = fdget_pos(fd); loff_t offset; if (!f.file) return -EBADF; retval = -EINVAL; if (whence > SEEK_MAX) goto out_putf; offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } out_putf: fdput_pos(f); return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(f.file, buf, count, ppos); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_read(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_write(f.file, buf, count, &pos); fdput(f); } return ret; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. */ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { loff_t pos, *ppos = file_ppos(f.file); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(f.file, vec, vlen, ppos, flags); if (ret >= 0 && ppos) f.file->f_pos = pos; fdput_pos(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_readv(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PWRITE) ret = vfs_writev(f.file, vec, vlen, &pos, flags); fdput(f); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ retval = -EBADF; in = fdget(in_fd); if (!in.file) goto out; if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; if (!ppos) { pos = in.file->f_pos; } else { pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; } retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ retval = -EBADF; out = fdget(out_fd); if (!out.file) goto fput_in; if (!(out.file->f_mode & FMODE_WRITE)) goto fput_out; in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) goto fput_out; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(out.file, true); if (!opipe) { retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); } else { if (out.file->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); out.file->f_pos = out_pos; if (ppos) *ppos = pos; else in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; fput_out: fdput(out); fput_in: fdput(in); out: return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, min_t(loff_t, MAX_RW_COUNT, len), REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, min_t(size_t, len, MAX_RW_COUNT), 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; struct fd f_in; struct fd f_out; ssize_t ret = -EBADF; f_in = fdget(fd_in); if (!f_in.file) goto out2; f_out = fdget(fd_out); if (!f_out.file) goto out1; ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) goto out; } else { pos_in = f_in.file->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) goto out; } else { pos_out = f_out.file->f_pos; } ret = -EINVAL; if (flags != 0) goto out; ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { f_in.file->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { f_out.file->f_pos = pos_out; } } out: fdput(f_out); out1: fdput(f_in); out2: return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; }
65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM notifier #if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NOTIFIERS_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(notifier_info, TP_PROTO(void *cb), TP_ARGS(cb), TP_STRUCT__entry( __field(void *, cb) ), TP_fast_assign( __entry->cb = cb; ), TP_printk("%ps", __entry->cb) ); /* * notifier_register - called upon notifier callback registration * * @cb: callback pointer * */ DEFINE_EVENT(notifier_info, notifier_register, TP_PROTO(void *cb), TP_ARGS(cb) ); /* * notifier_unregister - called upon notifier callback unregistration * * @cb: callback pointer * */ DEFINE_EVENT(notifier_info, notifier_unregister, TP_PROTO(void *cb), TP_ARGS(cb) ); /* * notifier_run - called upon notifier callback execution * * @cb: callback pointer * */ DEFINE_EVENT(notifier_info, notifier_run, TP_PROTO(void *cb), TP_ARGS(cb) ); #endif /* _TRACE_NOTIFIERS_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
6 7 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2001 Vojtech Pavlik * Copyright (c) 2006-2007 Jiri Kosina */ /* * * Should you need to contact me, the author, you can do so either by * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail: * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic */ #ifndef __HID_H #define __HID_H #include <linux/bitops.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/mod_devicetable.h> /* hid_device_id */ #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/input.h> #include <linux/semaphore.h> #include <linux/mutex.h> #include <linux/power_supply.h> #include <uapi/linux/hid.h> #include <linux/hid_bpf.h> /* * We parse each description item into this structure. Short items data * values are expanded to 32-bit signed int, long items contain a pointer * into the data area. */ struct hid_item { unsigned format; __u8 size; __u8 type; __u8 tag; union { __u8 u8; __s8 s8; __u16 u16; __s16 s16; __u32 u32; __s32 s32; __u8 *longdata; } data; }; /* * HID report item format */ #define HID_ITEM_FORMAT_SHORT 0 #define HID_ITEM_FORMAT_LONG 1 /* * Special tag indicating long items */ #define HID_ITEM_TAG_LONG 15 /* * HID report descriptor item type (prefix bit 2,3) */ #define HID_ITEM_TYPE_MAIN 0 #define HID_ITEM_TYPE_GLOBAL 1 #define HID_ITEM_TYPE_LOCAL 2 #define HID_ITEM_TYPE_RESERVED 3 /* * HID report descriptor main item tags */ #define HID_MAIN_ITEM_TAG_INPUT 8 #define HID_MAIN_ITEM_TAG_OUTPUT 9 #define HID_MAIN_ITEM_TAG_FEATURE 11 #define HID_MAIN_ITEM_TAG_BEGIN_COLLECTION 10 #define HID_MAIN_ITEM_TAG_END_COLLECTION 12 /* * HID report descriptor main item contents */ #define HID_MAIN_ITEM_CONSTANT 0x001 #define HID_MAIN_ITEM_VARIABLE 0x002 #define HID_MAIN_ITEM_RELATIVE 0x004 #define HID_MAIN_ITEM_WRAP 0x008 #define HID_MAIN_ITEM_NONLINEAR 0x010 #define HID_MAIN_ITEM_NO_PREFERRED 0x020 #define HID_MAIN_ITEM_NULL_STATE 0x040 #define HID_MAIN_ITEM_VOLATILE 0x080 #define HID_MAIN_ITEM_BUFFERED_BYTE 0x100 /* * HID report descriptor collection item types */ #define HID_COLLECTION_PHYSICAL 0 #define HID_COLLECTION_APPLICATION 1 #define HID_COLLECTION_LOGICAL 2 #define HID_COLLECTION_NAMED_ARRAY 4 /* * HID report descriptor global item tags */ #define HID_GLOBAL_ITEM_TAG_USAGE_PAGE 0 #define HID_GLOBAL_ITEM_TAG_LOGICAL_MINIMUM 1 #define HID_GLOBAL_ITEM_TAG_LOGICAL_MAXIMUM 2 #define HID_GLOBAL_ITEM_TAG_PHYSICAL_MINIMUM 3 #define HID_GLOBAL_ITEM_TAG_PHYSICAL_MAXIMUM 4 #define HID_GLOBAL_ITEM_TAG_UNIT_EXPONENT 5 #define HID_GLOBAL_ITEM_TAG_UNIT 6 #define HID_GLOBAL_ITEM_TAG_REPORT_SIZE 7 #define HID_GLOBAL_ITEM_TAG_REPORT_ID 8 #define HID_GLOBAL_ITEM_TAG_REPORT_COUNT 9 #define HID_GLOBAL_ITEM_TAG_PUSH 10 #define HID_GLOBAL_ITEM_TAG_POP 11 /* * HID report descriptor local item tags */ #define HID_LOCAL_ITEM_TAG_USAGE 0 #define HID_LOCAL_ITEM_TAG_USAGE_MINIMUM 1 #define HID_LOCAL_ITEM_TAG_USAGE_MAXIMUM 2 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_INDEX 3 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_MINIMUM 4 #define HID_LOCAL_ITEM_TAG_DESIGNATOR_MAXIMUM 5 #define HID_LOCAL_ITEM_TAG_STRING_INDEX 7 #define HID_LOCAL_ITEM_TAG_STRING_MINIMUM 8 #define HID_LOCAL_ITEM_TAG_STRING_MAXIMUM 9 #define HID_LOCAL_ITEM_TAG_DELIMITER 10 /* * HID usage tables */ #define HID_USAGE_PAGE 0xffff0000 #define HID_UP_UNDEFINED 0x00000000 #define HID_UP_GENDESK 0x00010000 #define HID_UP_SIMULATION 0x00020000 #define HID_UP_GENDEVCTRLS 0x00060000 #define HID_UP_KEYBOARD 0x00070000 #define HID_UP_LED 0x00080000 #define HID_UP_BUTTON 0x00090000 #define HID_UP_ORDINAL 0x000a0000 #define HID_UP_TELEPHONY 0x000b0000 #define HID_UP_CONSUMER 0x000c0000 #define HID_UP_DIGITIZER 0x000d0000 #define HID_UP_PID 0x000f0000 #define HID_UP_BATTERY 0x00850000 #define HID_UP_CAMERA 0x00900000 #define HID_UP_HPVENDOR 0xff7f0000 #define HID_UP_HPVENDOR2 0xff010000 #define HID_UP_MSVENDOR 0xff000000 #define HID_UP_CUSTOM 0x00ff0000 #define HID_UP_LOGIVENDOR 0xffbc0000 #define HID_UP_LOGIVENDOR2 0xff090000 #define HID_UP_LOGIVENDOR3 0xff430000 #define HID_UP_LNVENDOR 0xffa00000 #define HID_UP_SENSOR 0x00200000 #define HID_UP_ASUSVENDOR 0xff310000 #define HID_UP_GOOGLEVENDOR 0xffd10000 #define HID_USAGE 0x0000ffff #define HID_GD_POINTER 0x00010001 #define HID_GD_MOUSE 0x00010002 #define HID_GD_JOYSTICK 0x00010004 #define HID_GD_GAMEPAD 0x00010005 #define HID_GD_KEYBOARD 0x00010006 #define HID_GD_KEYPAD 0x00010007 #define HID_GD_MULTIAXIS 0x00010008 /* * Microsoft Win8 Wireless Radio Controls extensions CA, see: * http://www.usb.org/developers/hidpage/HUTRR40RadioHIDUsagesFinal.pdf */ #define HID_GD_WIRELESS_RADIO_CTLS 0x0001000c /* * System Multi-Axis, see: * http://www.usb.org/developers/hidpage/HUTRR62_-_Generic_Desktop_CA_for_System_Multi-Axis_Controllers.txt */ #define HID_GD_SYSTEM_MULTIAXIS 0x0001000e #define HID_GD_X 0x00010030 #define HID_GD_Y 0x00010031 #define HID_GD_Z 0x00010032 #define HID_GD_RX 0x00010033 #define HID_GD_RY 0x00010034 #define HID_GD_RZ 0x00010035 #define HID_GD_SLIDER 0x00010036 #define HID_GD_DIAL 0x00010037 #define HID_GD_WHEEL 0x00010038 #define HID_GD_HATSWITCH 0x00010039 #define HID_GD_BUFFER 0x0001003a #define HID_GD_BYTECOUNT 0x0001003b #define HID_GD_MOTION 0x0001003c #define HID_GD_START 0x0001003d #define HID_GD_SELECT 0x0001003e #define HID_GD_VX 0x00010040 #define HID_GD_VY 0x00010041 #define HID_GD_VZ 0x00010042 #define HID_GD_VBRX 0x00010043 #define HID_GD_VBRY 0x00010044 #define HID_GD_VBRZ 0x00010045 #define HID_GD_VNO 0x00010046 #define HID_GD_FEATURE 0x00010047 #define HID_GD_RESOLUTION_MULTIPLIER 0x00010048 #define HID_GD_SYSTEM_CONTROL 0x00010080 #define HID_GD_UP 0x00010090 #define HID_GD_DOWN 0x00010091 #define HID_GD_RIGHT 0x00010092 #define HID_GD_LEFT 0x00010093 /* Microsoft Win8 Wireless Radio Controls CA usage codes */ #define HID_GD_RFKILL_BTN 0x000100c6 #define HID_GD_RFKILL_LED 0x000100c7 #define HID_GD_RFKILL_SWITCH 0x000100c8 #define HID_DC_BATTERYSTRENGTH 0x00060020 #define HID_CP_CONSUMER_CONTROL 0x000c0001 #define HID_CP_AC_PAN 0x000c0238 #define HID_DG_DIGITIZER 0x000d0001 #define HID_DG_PEN 0x000d0002 #define HID_DG_LIGHTPEN 0x000d0003 #define HID_DG_TOUCHSCREEN 0x000d0004 #define HID_DG_TOUCHPAD 0x000d0005 #define HID_DG_WHITEBOARD 0x000d0006 #define HID_DG_STYLUS 0x000d0020 #define HID_DG_PUCK 0x000d0021 #define HID_DG_FINGER 0x000d0022 #define HID_DG_TIPPRESSURE 0x000d0030 #define HID_DG_BARRELPRESSURE 0x000d0031 #define HID_DG_INRANGE 0x000d0032 #define HID_DG_TOUCH 0x000d0033 #define HID_DG_UNTOUCH 0x000d0034 #define HID_DG_TAP 0x000d0035 #define HID_DG_TRANSDUCER_INDEX 0x000d0038 #define HID_DG_TABLETFUNCTIONKEY 0x000d0039 #define HID_DG_PROGRAMCHANGEKEY 0x000d003a #define HID_DG_BATTERYSTRENGTH 0x000d003b #define HID_DG_INVERT 0x000d003c #define HID_DG_TILT_X 0x000d003d #define HID_DG_TILT_Y 0x000d003e #define HID_DG_TWIST 0x000d0041 #define HID_DG_TIPSWITCH 0x000d0042 #define HID_DG_TIPSWITCH2 0x000d0043 #define HID_DG_BARRELSWITCH 0x000d0044 #define HID_DG_ERASER 0x000d0045 #define HID_DG_TABLETPICK 0x000d0046 #define HID_DG_PEN_COLOR 0x000d005c #define HID_DG_PEN_LINE_WIDTH 0x000d005e #define HID_DG_PEN_LINE_STYLE 0x000d0070 #define HID_DG_PEN_LINE_STYLE_INK 0x000d0072 #define HID_DG_PEN_LINE_STYLE_PENCIL 0x000d0073 #define HID_DG_PEN_LINE_STYLE_HIGHLIGHTER 0x000d0074 #define HID_DG_PEN_LINE_STYLE_CHISEL_MARKER 0x000d0075 #define HID_DG_PEN_LINE_STYLE_BRUSH 0x000d0076 #define HID_DG_PEN_LINE_STYLE_NO_PREFERENCE 0x000d0077 #define HID_CP_CONSUMERCONTROL 0x000c0001 #define HID_CP_NUMERICKEYPAD 0x000c0002 #define HID_CP_PROGRAMMABLEBUTTONS 0x000c0003 #define HID_CP_MICROPHONE 0x000c0004 #define HID_CP_HEADPHONE 0x000c0005 #define HID_CP_GRAPHICEQUALIZER 0x000c0006 #define HID_CP_FUNCTIONBUTTONS 0x000c0036 #define HID_CP_SELECTION 0x000c0080 #define HID_CP_MEDIASELECTION 0x000c0087 #define HID_CP_SELECTDISC 0x000c00ba #define HID_CP_VOLUMEUP 0x000c00e9 #define HID_CP_VOLUMEDOWN 0x000c00ea #define HID_CP_PLAYBACKSPEED 0x000c00f1 #define HID_CP_PROXIMITY 0x000c0109 #define HID_CP_SPEAKERSYSTEM 0x000c0160 #define HID_CP_CHANNELLEFT 0x000c0161 #define HID_CP_CHANNELRIGHT 0x000c0162 #define HID_CP_CHANNELCENTER 0x000c0163 #define HID_CP_CHANNELFRONT 0x000c0164 #define HID_CP_CHANNELCENTERFRONT 0x000c0165 #define HID_CP_CHANNELSIDE 0x000c0166 #define HID_CP_CHANNELSURROUND 0x000c0167 #define HID_CP_CHANNELLOWFREQUENCYENHANCEMENT 0x000c0168 #define HID_CP_CHANNELTOP 0x000c0169 #define HID_CP_CHANNELUNKNOWN 0x000c016a #define HID_CP_APPLICATIONLAUNCHBUTTONS 0x000c0180 #define HID_CP_GENERICGUIAPPLICATIONCONTROLS 0x000c0200 #define HID_DG_DEVICECONFIG 0x000d000e #define HID_DG_DEVICESETTINGS 0x000d0023 #define HID_DG_AZIMUTH 0x000d003f #define HID_DG_CONFIDENCE 0x000d0047 #define HID_DG_WIDTH 0x000d0048 #define HID_DG_HEIGHT 0x000d0049 #define HID_DG_CONTACTID 0x000d0051 #define HID_DG_INPUTMODE 0x000d0052 #define HID_DG_DEVICEINDEX 0x000d0053 #define HID_DG_CONTACTCOUNT 0x000d0054 #define HID_DG_CONTACTMAX 0x000d0055 #define HID_DG_SCANTIME 0x000d0056 #define HID_DG_SURFACESWITCH 0x000d0057 #define HID_DG_BUTTONSWITCH 0x000d0058 #define HID_DG_BUTTONTYPE 0x000d0059 #define HID_DG_BARRELSWITCH2 0x000d005a #define HID_DG_TOOLSERIALNUMBER 0x000d005b #define HID_DG_LATENCYMODE 0x000d0060 #define HID_BAT_ABSOLUTESTATEOFCHARGE 0x00850065 #define HID_BAT_CHARGING 0x00850044 #define HID_VD_ASUS_CUSTOM_MEDIA_KEYS 0xff310076 /* * HID connect requests */ #define HID_CONNECT_HIDINPUT BIT(0) #define HID_CONNECT_HIDINPUT_FORCE BIT(1) #define HID_CONNECT_HIDRAW BIT(2) #define HID_CONNECT_HIDDEV BIT(3) #define HID_CONNECT_HIDDEV_FORCE BIT(4) #define HID_CONNECT_FF BIT(5) #define HID_CONNECT_DRIVER BIT(6) #define HID_CONNECT_DEFAULT (HID_CONNECT_HIDINPUT|HID_CONNECT_HIDRAW| \ HID_CONNECT_HIDDEV|HID_CONNECT_FF) /* * HID device quirks. */ /* * Increase this if you need to configure more HID quirks at module load time */ #define MAX_USBHID_BOOT_QUIRKS 4 /** * DOC: HID quirks * | @HID_QUIRK_NOTOUCH: * | @HID_QUIRK_IGNORE: ignore this device * | @HID_QUIRK_NOGET: * | @HID_QUIRK_HIDDEV_FORCE: * | @HID_QUIRK_BADPAD: * | @HID_QUIRK_MULTI_INPUT: * | @HID_QUIRK_HIDINPUT_FORCE: * | @HID_QUIRK_ALWAYS_POLL: * | @HID_QUIRK_INPUT_PER_APP: * | @HID_QUIRK_X_INVERT: * | @HID_QUIRK_Y_INVERT: * | @HID_QUIRK_SKIP_OUTPUT_REPORTS: * | @HID_QUIRK_SKIP_OUTPUT_REPORT_ID: * | @HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP: * | @HID_QUIRK_HAVE_SPECIAL_DRIVER: * | @HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE: * | @HID_QUIRK_FULLSPEED_INTERVAL: * | @HID_QUIRK_NO_INIT_REPORTS: * | @HID_QUIRK_NO_IGNORE: * | @HID_QUIRK_NO_INPUT_SYNC: */ /* BIT(0) reserved for backward compatibility, was HID_QUIRK_INVERT */ #define HID_QUIRK_NOTOUCH BIT(1) #define HID_QUIRK_IGNORE BIT(2) #define HID_QUIRK_NOGET BIT(3) #define HID_QUIRK_HIDDEV_FORCE BIT(4) #define HID_QUIRK_BADPAD BIT(5) #define HID_QUIRK_MULTI_INPUT BIT(6) #define HID_QUIRK_HIDINPUT_FORCE BIT(7) /* BIT(8) reserved for backward compatibility, was HID_QUIRK_NO_EMPTY_INPUT */ /* BIT(9) reserved for backward compatibility, was NO_INIT_INPUT_REPORTS */ #define HID_QUIRK_ALWAYS_POLL BIT(10) #define HID_QUIRK_INPUT_PER_APP BIT(11) #define HID_QUIRK_X_INVERT BIT(12) #define HID_QUIRK_Y_INVERT BIT(13) #define HID_QUIRK_SKIP_OUTPUT_REPORTS BIT(16) #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID BIT(17) #define HID_QUIRK_NO_OUTPUT_REPORTS_ON_INTR_EP BIT(18) #define HID_QUIRK_HAVE_SPECIAL_DRIVER BIT(19) #define HID_QUIRK_INCREMENT_USAGE_ON_DUPLICATE BIT(20) #define HID_QUIRK_NOINVERT BIT(21) #define HID_QUIRK_FULLSPEED_INTERVAL BIT(28) #define HID_QUIRK_NO_INIT_REPORTS BIT(29) #define HID_QUIRK_NO_IGNORE BIT(30) #define HID_QUIRK_NO_INPUT_SYNC BIT(31) /* * HID device groups * * Note: HID_GROUP_ANY is declared in linux/mod_devicetable.h * and has a value of 0x0000 */ #define HID_GROUP_GENERIC 0x0001 #define HID_GROUP_MULTITOUCH 0x0002 #define HID_GROUP_SENSOR_HUB 0x0003 #define HID_GROUP_MULTITOUCH_WIN_8 0x0004 /* * Vendor specific HID device groups */ #define HID_GROUP_RMI 0x0100 #define HID_GROUP_WACOM 0x0101 #define HID_GROUP_LOGITECH_DJ_DEVICE 0x0102 #define HID_GROUP_STEAM 0x0103 #define HID_GROUP_LOGITECH_27MHZ_DEVICE 0x0104 #define HID_GROUP_VIVALDI 0x0105 /* * HID protocol status */ #define HID_REPORT_PROTOCOL 1 #define HID_BOOT_PROTOCOL 0 /* * This is the global environment of the parser. This information is * persistent for main-items. The global environment can be saved and * restored with PUSH/POP statements. */ struct hid_global { unsigned usage_page; __s32 logical_minimum; __s32 logical_maximum; __s32 physical_minimum; __s32 physical_maximum; __s32 unit_exponent; unsigned unit; unsigned report_id; unsigned report_size; unsigned report_count; }; /* * This is the local environment. It is persistent up the next main-item. */ #define HID_MAX_USAGES 12288 #define HID_DEFAULT_NUM_COLLECTIONS 16 struct hid_local { unsigned usage[HID_MAX_USAGES]; /* usage array */ u8 usage_size[HID_MAX_USAGES]; /* usage size array */ unsigned collection_index[HID_MAX_USAGES]; /* collection index array */ unsigned usage_index; unsigned usage_minimum; unsigned delimiter_depth; unsigned delimiter_branch; }; /* * This is the collection stack. We climb up the stack to determine * application and function of each field. */ struct hid_collection { int parent_idx; /* device->collection */ unsigned type; unsigned usage; unsigned level; }; struct hid_usage { unsigned hid; /* hid usage code */ unsigned collection_index; /* index into collection array */ unsigned usage_index; /* index into usage array */ __s8 resolution_multiplier;/* Effective Resolution Multiplier (HUT v1.12, 4.3.1), default: 1 */ /* hidinput data */ __s8 wheel_factor; /* 120/resolution_multiplier */ __u16 code; /* input driver code */ __u8 type; /* input driver type */ __s16 hat_min; /* hat switch fun */ __s16 hat_max; /* ditto */ __s16 hat_dir; /* ditto */ __s16 wheel_accumulated; /* hi-res wheel */ }; struct hid_input; struct hid_field { unsigned physical; /* physical usage for this field */ unsigned logical; /* logical usage for this field */ unsigned application; /* application usage for this field */ struct hid_usage *usage; /* usage table for this function */ unsigned maxusage; /* maximum usage index */ unsigned flags; /* main-item flags (i.e. volatile,array,constant) */ unsigned report_offset; /* bit offset in the report */ unsigned report_size; /* size of this field in the report */ unsigned report_count; /* number of this field in the report */ unsigned report_type; /* (input,output,feature) */ __s32 *value; /* last known value(s) */ __s32 *new_value; /* newly read value(s) */ __s32 *usages_priorities; /* priority of each usage when reading the report * bits 8-16 are reserved for hid-input usage */ __s32 logical_minimum; __s32 logical_maximum; __s32 physical_minimum; __s32 physical_maximum; __s32 unit_exponent; unsigned unit; bool ignored; /* this field is ignored in this event */ struct hid_report *report; /* associated report */ unsigned index; /* index into report->field[] */ /* hidinput data */ struct hid_input *hidinput; /* associated input structure */ __u16 dpad; /* dpad input code */ unsigned int slot_idx; /* slot index in a report */ }; #define HID_MAX_FIELDS 256 struct hid_field_entry { struct list_head list; struct hid_field *field; unsigned int index; __s32 priority; }; struct hid_report { struct list_head list; struct list_head hidinput_list; struct list_head field_entry_list; /* ordered list of input fields */ unsigned int id; /* id of this report */ enum hid_report_type type; /* report type */ unsigned int application; /* application usage for this report */ struct hid_field *field[HID_MAX_FIELDS]; /* fields of the report */ struct hid_field_entry *field_entries; /* allocated memory of input field_entry */ unsigned maxfield; /* maximum valid field index */ unsigned size; /* size of the report (bits) */ struct hid_device *device; /* associated device */ /* tool related state */ bool tool_active; /* whether the current tool is active */ unsigned int tool; /* BTN_TOOL_* */ }; #define HID_MAX_IDS 256 struct hid_report_enum { unsigned numbered; struct list_head report_list; struct hid_report *report_id_hash[HID_MAX_IDS]; }; #define HID_MIN_BUFFER_SIZE 64 /* make sure there is at least a packet size of space */ #define HID_MAX_BUFFER_SIZE 16384 /* 16kb */ #define HID_CONTROL_FIFO_SIZE 256 /* to init devices with >100 reports */ #define HID_OUTPUT_FIFO_SIZE 64 struct hid_control_fifo { unsigned char dir; struct hid_report *report; char *raw_report; }; struct hid_output_fifo { struct hid_report *report; char *raw_report; }; #define HID_CLAIMED_INPUT BIT(0) #define HID_CLAIMED_HIDDEV BIT(1) #define HID_CLAIMED_HIDRAW BIT(2) #define HID_CLAIMED_DRIVER BIT(3) #define HID_STAT_ADDED BIT(0) #define HID_STAT_PARSED BIT(1) #define HID_STAT_DUP_DETECTED BIT(2) #define HID_STAT_REPROBED BIT(3) struct hid_input { struct list_head list; struct hid_report *report; struct input_dev *input; const char *name; struct list_head reports; /* the list of reports */ unsigned int application; /* application usage for this input */ bool registered; }; enum hid_type { HID_TYPE_OTHER = 0, HID_TYPE_USBMOUSE, HID_TYPE_USBNONE }; enum hid_battery_status { HID_BATTERY_UNKNOWN = 0, HID_BATTERY_QUERIED, /* Kernel explicitly queried battery strength */ HID_BATTERY_REPORTED, /* Device sent unsolicited battery strength report */ }; struct hid_driver; struct hid_ll_driver; struct hid_device { /* device report descriptor */ __u8 *dev_rdesc; unsigned dev_rsize; __u8 *rdesc; unsigned rsize; struct hid_collection *collection; /* List of HID collections */ unsigned collection_size; /* Number of allocated hid_collections */ unsigned maxcollection; /* Number of parsed collections */ unsigned maxapplication; /* Number of applications */ __u16 bus; /* BUS ID */ __u16 group; /* Report group */ __u32 vendor; /* Vendor ID */ __u32 product; /* Product ID */ __u32 version; /* HID version */ enum hid_type type; /* device type (mouse, kbd, ...) */ unsigned country; /* HID country */ struct hid_report_enum report_enum[HID_REPORT_TYPES]; struct work_struct led_work; /* delayed LED worker */ struct semaphore driver_input_lock; /* protects the current driver */ struct device dev; /* device */ struct hid_driver *driver; void *devres_group_id; /* ID of probe devres group */ const struct hid_ll_driver *ll_driver; struct mutex ll_open_lock; unsigned int ll_open_count; #ifdef CONFIG_HID_BATTERY_STRENGTH /* * Power supply information for HID devices which report * battery strength. power_supply was successfully registered if * battery is non-NULL. */ struct power_supply *battery; __s32 battery_capacity; __s32 battery_min; __s32 battery_max; __s32 battery_report_type; __s32 battery_report_id; __s32 battery_charge_status; enum hid_battery_status battery_status; bool battery_avoid_query; ktime_t battery_ratelimit_time; #endif unsigned long status; /* see STAT flags above */ unsigned claimed; /* Claimed by hidinput, hiddev? */ unsigned quirks; /* Various quirks the device can pull on us */ unsigned initial_quirks; /* Initial set of quirks supplied when creating device */ bool io_started; /* If IO has started */ struct list_head inputs; /* The list of inputs */ void *hiddev; /* The hiddev structure */ void *hidraw; char name[128]; /* Device name */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ void *driver_data; /* temporary hid_ff handling (until moved to the drivers) */ int (*ff_init)(struct hid_device *); /* hiddev event handler */ int (*hiddev_connect)(struct hid_device *, unsigned int); void (*hiddev_disconnect)(struct hid_device *); void (*hiddev_hid_event) (struct hid_device *, struct hid_field *field, struct hid_usage *, __s32); void (*hiddev_report_event) (struct hid_device *, struct hid_report *); /* debugging support via debugfs */ unsigned short debug; struct dentry *debug_dir; struct dentry *debug_rdesc; struct dentry *debug_events; struct list_head debug_list; spinlock_t debug_list_lock; wait_queue_head_t debug_wait; struct kref ref; unsigned int id; /* system unique id */ #ifdef CONFIG_HID_BPF struct hid_bpf bpf; /* hid-bpf data */ #endif /* CONFIG_HID_BPF */ }; void hiddev_free(struct kref *ref); #define to_hid_device(pdev) \ container_of(pdev, struct hid_device, dev) static inline void *hid_get_drvdata(struct hid_device *hdev) { return dev_get_drvdata(&hdev->dev); } static inline void hid_set_drvdata(struct hid_device *hdev, void *data) { dev_set_drvdata(&hdev->dev, data); } #define HID_GLOBAL_STACK_SIZE 4 #define HID_COLLECTION_STACK_SIZE 4 #define HID_SCAN_FLAG_MT_WIN_8 BIT(0) #define HID_SCAN_FLAG_VENDOR_SPECIFIC BIT(1) #define HID_SCAN_FLAG_GD_POINTER BIT(2) struct hid_parser { struct hid_global global; struct hid_global global_stack[HID_GLOBAL_STACK_SIZE]; unsigned int global_stack_ptr; struct hid_local local; unsigned int *collection_stack; unsigned int collection_stack_ptr; unsigned int collection_stack_size; struct hid_device *device; unsigned int scan_flags; }; struct hid_class_descriptor { __u8 bDescriptorType; __le16 wDescriptorLength; } __attribute__ ((packed)); struct hid_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdHID; __u8 bCountryCode; __u8 bNumDescriptors; struct hid_class_descriptor desc[1]; } __attribute__ ((packed)); #define HID_DEVICE(b, g, ven, prod) \ .bus = (b), .group = (g), .vendor = (ven), .product = (prod) #define HID_USB_DEVICE(ven, prod) \ .bus = BUS_USB, .vendor = (ven), .product = (prod) #define HID_BLUETOOTH_DEVICE(ven, prod) \ .bus = BUS_BLUETOOTH, .vendor = (ven), .product = (prod) #define HID_I2C_DEVICE(ven, prod) \ .bus = BUS_I2C, .vendor = (ven), .product = (prod) #define HID_REPORT_ID(rep) \ .report_type = (rep) #define HID_USAGE_ID(uhid, utype, ucode) \ .usage_hid = (uhid), .usage_type = (utype), .usage_code = (ucode) /* we don't want to catch types and codes equal to 0 */ #define HID_TERMINATOR (HID_ANY_ID - 1) struct hid_report_id { __u32 report_type; }; struct hid_usage_id { __u32 usage_hid; __u32 usage_type; __u32 usage_code; }; /** * struct hid_driver * @name: driver name (e.g. "Footech_bar-wheel") * @id_table: which devices is this driver for (must be non-NULL for probe * to be called) * @dyn_list: list of dynamically added device ids * @dyn_lock: lock protecting @dyn_list * @match: check if the given device is handled by this driver * @probe: new device inserted * @remove: device removed (NULL if not a hot-plug capable driver) * @report_table: on which reports to call raw_event (NULL means all) * @raw_event: if report in report_table, this hook is called (NULL means nop) * @usage_table: on which events to call event (NULL means all) * @event: if usage in usage_table, this hook is called (NULL means nop) * @report: this hook is called after parsing a report (NULL means nop) * @report_fixup: called before report descriptor parsing (NULL means nop) * @input_mapping: invoked on input registering before mapping an usage * @input_mapped: invoked on input registering after mapping an usage * @input_configured: invoked just before the device is registered * @feature_mapping: invoked on feature registering * @suspend: invoked on suspend (NULL means nop) * @resume: invoked on resume if device was not reset (NULL means nop) * @reset_resume: invoked on resume if device was reset (NULL means nop) * * probe should return -errno on error, or 0 on success. During probe, * input will not be passed to raw_event unless hid_device_io_start is * called. * * raw_event and event should return negative on error, any other value will * pass the event on to .event() typically return 0 for success. * * input_mapping shall return a negative value to completely ignore this usage * (e.g. doubled or invalid usage), zero to continue with parsing of this * usage by generic code (no special handling needed) or positive to skip * generic parsing (needed special handling which was done in the hook already) * input_mapped shall return negative to inform the layer that this usage * should not be considered for further processing or zero to notify that * no processing was performed and should be done in a generic manner * Both these functions may be NULL which means the same behavior as returning * zero from them. */ struct hid_driver { char *name; const struct hid_device_id *id_table; struct list_head dyn_list; spinlock_t dyn_lock; bool (*match)(struct hid_device *dev, bool ignore_special_driver); int (*probe)(struct hid_device *dev, const struct hid_device_id *id); void (*remove)(struct hid_device *dev); const struct hid_report_id *report_table; int (*raw_event)(struct hid_device *hdev, struct hid_report *report, u8 *data, int size); const struct hid_usage_id *usage_table; int (*event)(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value); void (*report)(struct hid_device *hdev, struct hid_report *report); __u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf, unsigned int *size); int (*input_mapping)(struct hid_device *hdev, struct hid_input *hidinput, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max); int (*input_mapped)(struct hid_device *hdev, struct hid_input *hidinput, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max); int (*input_configured)(struct hid_device *hdev, struct hid_input *hidinput); void (*feature_mapping)(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage); int (*suspend)(struct hid_device *hdev, pm_message_t message); int (*resume)(struct hid_device *hdev); int (*reset_resume)(struct hid_device *hdev); /* private: */ struct device_driver driver; }; #define to_hid_driver(pdrv) \ container_of(pdrv, struct hid_driver, driver) /** * struct hid_ll_driver - low level driver callbacks * @start: called on probe to start the device * @stop: called on remove * @open: called by input layer on open * @close: called by input layer on close * @power: request underlying hardware to enter requested power mode * @parse: this method is called only once to parse the device data, * shouldn't allocate anything to not leak memory * @request: send report request to device (e.g. feature report) * @wait: wait for buffered io to complete (send/recv reports) * @raw_request: send raw report request to device (e.g. feature report) * @output_report: send output report to device * @idle: send idle request to device * @may_wakeup: return if device may act as a wakeup source during system-suspend * @max_buffer_size: over-ride maximum data buffer size (default: HID_MAX_BUFFER_SIZE) */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); void (*stop)(struct hid_device *hdev); int (*open)(struct hid_device *hdev); void (*close)(struct hid_device *hdev); int (*power)(struct hid_device *hdev, int level); int (*parse)(struct hid_device *hdev); void (*request)(struct hid_device *hdev, struct hid_report *report, int reqtype); int (*wait)(struct hid_device *hdev); int (*raw_request) (struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype); int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len); int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); bool (*may_wakeup)(struct hid_device *hdev); unsigned int max_buffer_size; }; extern bool hid_is_usb(const struct hid_device *hdev); #define PM_HINT_FULLON 1<<5 #define PM_HINT_NORMAL 1<<1 /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ #define IS_INPUT_APPLICATION(a) \ (((a >= HID_UP_GENDESK) && (a <= HID_GD_MULTIAXIS)) \ || ((a >= HID_DG_DIGITIZER) && (a <= HID_DG_WHITEBOARD)) \ || (a == HID_GD_SYSTEM_CONTROL) || (a == HID_CP_CONSUMER_CONTROL) \ || (a == HID_GD_WIRELESS_RADIO_CTLS)) /* HID core API */ extern bool hid_ignore(struct hid_device *); extern int hid_add_device(struct hid_device *); extern void hid_destroy_device(struct hid_device *); extern const struct bus_type hid_bus_type; extern int __must_check __hid_register_driver(struct hid_driver *, struct module *, const char *mod_name); /* use a define to avoid include chaining to get THIS_MODULE & friends */ #define hid_register_driver(driver) \ __hid_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) extern void hid_unregister_driver(struct hid_driver *); /** * module_hid_driver() - Helper macro for registering a HID driver * @__hid_driver: hid_driver struct * * Helper macro for HID drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_hid_driver(__hid_driver) \ module_driver(__hid_driver, hid_register_driver, \ hid_unregister_driver) extern void hidinput_hid_event(struct hid_device *, struct hid_field *, struct hid_usage *, __s32); extern void hidinput_report_event(struct hid_device *hid, struct hid_report *report); extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); void hid_output_report(struct hid_report *report, __u8 *data); int __hid_request(struct hid_device *hid, struct hid_report *rep, enum hid_class_request reqtype); u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags); struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, enum hid_report_type type, unsigned int id, unsigned int application); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); struct hid_report *hid_validate_values(struct hid_device *hid, enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts); void hid_setup_resolution_multiplier(struct hid_device *hid); int hid_open_report(struct hid_device *device); int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); void hid_disconnect(struct hid_device *hid); bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id); const struct hid_device_id *hid_match_id(const struct hid_device *hdev, const struct hid_device_id *id); const struct hid_device_id *hid_match_device(struct hid_device *hdev, struct hid_driver *hdrv); bool hid_compare_device_paths(struct hid_device *hdev_a, struct hid_device *hdev_b, char separator); s32 hid_snto32(__u32 value, unsigned n); __u32 hid_field_extract(const struct hid_device *hid, __u8 *report, unsigned offset, unsigned n); #ifdef CONFIG_PM int hid_driver_suspend(struct hid_device *hdev, pm_message_t state); int hid_driver_reset_resume(struct hid_device *hdev); int hid_driver_resume(struct hid_device *hdev); #else static inline int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { return 0; } static inline int hid_driver_reset_resume(struct hid_device *hdev) { return 0; } static inline int hid_driver_resume(struct hid_device *hdev) { return 0; } #endif /** * hid_device_io_start - enable HID input during probe, remove * * @hid: the device * * This should only be called during probe or remove and only be * called by the thread calling probe or remove. It will allow * incoming packets to be delivered to the driver. */ static inline void hid_device_io_start(struct hid_device *hid) { if (hid->io_started) { dev_warn(&hid->dev, "io already started\n"); return; } hid->io_started = true; up(&hid->driver_input_lock); } /** * hid_device_io_stop - disable HID input during probe, remove * * @hid: the device * * Should only be called after hid_device_io_start. It will prevent * incoming packets from going to the driver for the duration of * probe, remove. If called during probe, packets will still go to the * driver after probe is complete. This function should only be called * by the thread calling probe or remove. */ static inline void hid_device_io_stop(struct hid_device *hid) { if (!hid->io_started) { dev_warn(&hid->dev, "io already stopped\n"); return; } hid->io_started = false; down(&hid->driver_input_lock); } /** * hid_map_usage - map usage input bits * * @hidinput: hidinput which we are interested in * @usage: usage to fill in * @bit: pointer to input->{}bit (out parameter) * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type * * The value pointed to by @bit will be set to NULL if either @type is * an unhandled event type, or if @c is out of range for @type. This * can be used as an error condition. */ static inline void hid_map_usage(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, __u8 type, unsigned int c) { struct input_dev *input = hidinput->input; unsigned long *bmap = NULL; unsigned int limit = 0; switch (type) { case EV_ABS: bmap = input->absbit; limit = ABS_MAX; break; case EV_REL: bmap = input->relbit; limit = REL_MAX; break; case EV_KEY: bmap = input->keybit; limit = KEY_MAX; break; case EV_LED: bmap = input->ledbit; limit = LED_MAX; break; case EV_MSC: bmap = input->mscbit; limit = MSC_MAX; break; } if (unlikely(c > limit || !bmap)) { pr_warn_ratelimited("%s: Invalid code %d type %d\n", input->name, c, type); *bit = NULL; return; } usage->type = type; usage->code = c; *max = limit; *bit = bmap; } /** * hid_map_usage_clear - map usage input bits and clear the input bit * * @hidinput: hidinput which we are interested in * @usage: usage to fill in * @bit: pointer to input->{}bit (out parameter) * @max: maximal valid usage->code to consider later (out parameter) * @type: input event type (EV_KEY, EV_REL, ...) * @c: code which corresponds to this usage and type * * The same as hid_map_usage, except the @c bit is also cleared in supported * bits (@bit). */ static inline void hid_map_usage_clear(struct hid_input *hidinput, struct hid_usage *usage, unsigned long **bit, int *max, __u8 type, __u16 c) { hid_map_usage(hidinput, usage, bit, max, type, c); if (*bit) clear_bit(usage->code, *bit); } /** * hid_parse - parse HW reports * * @hdev: hid device * * Call this from probe after you set up the device (if needed). Your * report_fixup will be called (if non-NULL) after reading raw report from * device before passing it to hid layer for real parsing. */ static inline int __must_check hid_parse(struct hid_device *hdev) { return hid_open_report(hdev); } int __must_check hid_hw_start(struct hid_device *hdev, unsigned int connect_mask); void hid_hw_stop(struct hid_device *hdev); int __must_check hid_hw_open(struct hid_device *hdev); void hid_hw_close(struct hid_device *hdev); void hid_hw_request(struct hid_device *hdev, struct hid_report *report, enum hid_class_request reqtype); int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype); int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len); /** * hid_hw_power - requests underlying HW to go into given power mode * * @hdev: hid device * @level: requested power level (one of %PM_HINT_* defines) * * This function requests underlying hardware to enter requested power * mode. */ static inline int hid_hw_power(struct hid_device *hdev, int level) { return hdev->ll_driver->power ? hdev->ll_driver->power(hdev, level) : 0; } /** * hid_hw_idle - send idle request to device * * @hdev: hid device * @report: report to control * @idle: idle state * @reqtype: hid request type */ static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, enum hid_class_request reqtype) { if (hdev->ll_driver->idle) return hdev->ll_driver->idle(hdev, report, idle, reqtype); return 0; } /** * hid_hw_may_wakeup - return if the hid device may act as a wakeup source during system-suspend * * @hdev: hid device */ static inline bool hid_hw_may_wakeup(struct hid_device *hdev) { if (hdev->ll_driver->may_wakeup) return hdev->ll_driver->may_wakeup(hdev); if (hdev->dev.parent) return device_may_wakeup(hdev->dev.parent); return false; } /** * hid_hw_wait - wait for buffered io to complete * * @hdev: hid device */ static inline void hid_hw_wait(struct hid_device *hdev) { if (hdev->ll_driver->wait) hdev->ll_driver->wait(hdev); } /** * hid_report_len - calculate the report length * * @report: the report we want to know the length */ static inline u32 hid_report_len(struct hid_report *report) { return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); int hid_quirks_init(char **quirks_param, __u16 bus, int count); void hid_quirks_exit(__u16 bus); #ifdef CONFIG_HID_PID int hid_pidff_init(struct hid_device *hid); #else #define hid_pidff_init NULL #endif #define dbg_hid(fmt, ...) pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__) #define hid_err(hid, fmt, ...) \ dev_err(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_notice(hid, fmt, ...) \ dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_warn(hid, fmt, ...) \ dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_info(hid, fmt, ...) \ dev_info(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_dbg(hid, fmt, ...) \ dev_dbg(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_err_once(hid, fmt, ...) \ dev_err_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_notice_once(hid, fmt, ...) \ dev_notice_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_warn_once(hid, fmt, ...) \ dev_warn_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_info_once(hid, fmt, ...) \ dev_info_once(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_dbg_once(hid, fmt, ...) \ dev_dbg_once(&(hid)->dev, fmt, ##__VA_ARGS__) #endif
6 127 191 4 191 4 224 30 140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BITOPS_H #define _ASM_X86_BITOPS_H /* * Copyright 1992, Linus Torvalds. * * Note: inlines with more than a single statement should be marked * __always_inline to avoid problems with older gcc's inlining heuristics. */ #ifndef _LINUX_BITOPS_H #error only <linux/bitops.h> can be included directly #endif #include <linux/compiler.h> #include <asm/alternative.h> #include <asm/rmwcc.h> #include <asm/barrier.h> #if BITS_PER_LONG == 32 # define _BITOPS_LONG_SHIFT 5 #elif BITS_PER_LONG == 64 # define _BITOPS_LONG_SHIFT 6 #else # error "Unexpected BITS_PER_LONG" #endif #define BIT_64(n) (U64_C(1) << (n)) /* * These have to be done with inline assembly: that way the bit-setting * is guaranteed to be atomic. All bit operations return 0 if the bit * was cleared before the operation and != 0 if it was not. * * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ #define RLONG_ADDR(x) "m" (*(volatile long *) (x)) #define WBYTE_ADDR(x) "+m" (*(volatile char *) (x)) #define ADDR RLONG_ADDR(addr) /* * We do the locked ops that don't return the old value as * a mask operation on a byte. */ #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr)) : "memory"); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch___set_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch_clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); arch_clear_bit(nr, addr); } static __always_inline void arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *addr) { bool negative; asm volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) : "iq" ((char)mask) : "memory"); return negative; } #define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) { arch___clear_bit(nr, addr); } static __always_inline void arch___change_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline bool arch_test_and_set_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } static __always_inline bool arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return arch_test_and_set_bit(nr, addr); } static __always_inline bool arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm(__ASM_SIZE(bts) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_clear_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } /* * Note: the operation is performed atomically with respect to * the local CPU, but not other CPUs. Portable code should not * rely on this behaviour. * KVM relies on this behaviour on x86 for modifying memory that is also * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ static __always_inline bool arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btr) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btc) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_change_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) { bool oldbit; asm volatile("testb %2,%1" CC_SET(nz) : CC_OUT(nz) (oldbit) : "m" (((unsigned char *)addr)[nr >> 3]), "i" (1 << (nr & 7)) :"memory"); return oldbit; } static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(bt) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) : variable_test_bit(nr, addr); } static __always_inline bool arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : variable_test_bit(nr, addr); } static __always_inline unsigned long variable__ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } /** * __ffs - find first set bit in word * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ #define __ffs(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) static __always_inline unsigned long variable_ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) : "r" (~word)); return word; } /** * ffz - find first zero bit in word * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ #define ffz(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(~word) : \ variable_ffz(word)) /* * __fls: find last set bit in word * @word: The word to search * * Undefined if no set bit exists, so code should check against 0 first. */ static __always_inline unsigned long __fls(unsigned long word) { if (__builtin_constant_p(word)) return BITS_PER_LONG - 1 - __builtin_clzl(word); asm("bsr %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } #undef ADDR #ifdef __KERNEL__ static __always_inline int variable_ffs(int x) { int r; #ifdef CONFIG_X86_64 /* * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ asm("bsfl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "r" (-1)); #else asm("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * ffs - find first set bit in word * @x: the word to search * * This is defined the same way as the libc and compiler builtin ffs * routines, therefore differs in spirit from the other bitops. * * ffs(value) returns 0 if value is 0 or the position of the first * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ #define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) /** * fls - find last set bit in word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffs, but returns the position of the most significant set bit. * * fls(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ static __always_inline int fls(unsigned int x) { int r; if (__builtin_constant_p(x)) return x ? 32 - __builtin_clz(x) : 0; #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ asm("bsrl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); #else asm("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #ifdef CONFIG_X86_64 static __always_inline int fls64(__u64 x) { int bitpos = -1; if (__builtin_constant_p(x)) return x ? 64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before. */ asm("bsrq %1,%q0" : "+r" (bitpos) : ASM_INPUT_RM (x)); return bitpos + 1; } #else #include <asm-generic/bitops/fls64.h> #endif #include <asm-generic/bitops/sched.h> #include <asm/arch_hweight.h> #include <asm-generic/bitops/const_hweight.h> #include <asm-generic/bitops/instrumented-atomic.h> #include <asm-generic/bitops/instrumented-non-atomic.h> #include <asm-generic/bitops/instrumented-lock.h> #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */
207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu #if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FPU_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(x86_fpu, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu), TP_STRUCT__entry( __field(struct fpu *, fpu) __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH asm/trace/ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE fpu #endif /* _TRACE_FPU_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2009 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> */ #ifndef _LINUX_INTEGRITY_H #define _LINUX_INTEGRITY_H #include <linux/fs.h> #include <linux/iversion.h> enum integrity_status { INTEGRITY_PASS = 0, INTEGRITY_PASS_IMMUTABLE, INTEGRITY_FAIL, INTEGRITY_FAIL_IMMUTABLE, INTEGRITY_NOLABEL, INTEGRITY_NOXATTRS, INTEGRITY_UNKNOWN, }; #ifdef CONFIG_INTEGRITY extern void __init integrity_load_keys(void); #else static inline void integrity_load_keys(void) { } #endif /* CONFIG_INTEGRITY */ /* An inode's attributes for detection of changes */ struct integrity_inode_attributes { u64 version; /* track inode changes */ unsigned long ino; dev_t dev; }; /* * On stacked filesystems the i_version alone is not enough to detect file data * or metadata change. Additional metadata is required. */ static inline void integrity_inode_attrs_store(struct integrity_inode_attributes *attrs, u64 i_version, const struct inode *inode) { attrs->version = i_version; attrs->dev = inode->i_sb->s_dev; attrs->ino = inode->i_ino; } /* * On stacked filesystems detect whether the inode or its content has changed. */ static inline bool integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs, const struct inode *inode) { return (inode->i_sb->s_dev != attrs->dev || inode->i_ino != attrs->ino || !inode_eq_iversion(inode, attrs->version)); } #endif /* _LINUX_INTEGRITY_H */
93 92 92 91 92 1 1 1 90 92 92 92 93 93 91 93 94 94 94 94 94 93 93 91 93 93 93 92 91 93 93 93 92 92 92 93 92 92 93 1 92 92 93 93 90 92 92 90 92 93 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 // SPDX-License-Identifier: GPL-2.0-or-later /* * NETLINK Kernel-user communication protocol. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Patrick McHardy <kaber@trash.net> * * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> * - inc module use count of module that owns * the kernel socket in case userspace opens * socket of same protocol * - remove all module support, since netlink is * mandatory if CONFIG_NET=y these days */ #include <linux/module.h> #include <linux/bpf.h> #include <linux/capability.h> #include <linux/kernel.h> #include <linux/filter.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <linux/btf_ids.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> #define CREATE_TRACE_POINTS #include <trace/events/netlink.h> #include "af_netlink.h" #include "genetlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[]; }; /* state bits */ #define NETLINK_S_CONGESTED 0x0 static inline int netlink_is_kernel(struct sock *sk) { return nlk_test_bit(KERNEL_SOCKET, sk); } struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-ROUTE", "nlk_cb_mutex-1", "nlk_cb_mutex-USERSOCK", "nlk_cb_mutex-FIREWALL", "nlk_cb_mutex-SOCK_DIAG", "nlk_cb_mutex-NFLOG", "nlk_cb_mutex-XFRM", "nlk_cb_mutex-SELINUX", "nlk_cb_mutex-ISCSI", "nlk_cb_mutex-AUDIT", "nlk_cb_mutex-FIB_LOOKUP", "nlk_cb_mutex-CONNECTOR", "nlk_cb_mutex-NETFILTER", "nlk_cb_mutex-IP6_FW", "nlk_cb_mutex-DNRTMSG", "nlk_cb_mutex-KOBJECT_UEVENT", "nlk_cb_mutex-GENERIC", "nlk_cb_mutex-17", "nlk_cb_mutex-SCSITRANSPORT", "nlk_cb_mutex-ECRYPTFS", "nlk_cb_mutex-RDMA", "nlk_cb_mutex-CRYPTO", "nlk_cb_mutex-SMC", "nlk_cb_mutex-23", "nlk_cb_mutex-24", "nlk_cb_mutex-25", "nlk_cb_mutex-26", "nlk_cb_mutex-27", "nlk_cb_mutex-28", "nlk_cb_mutex-29", "nlk_cb_mutex-30", "nlk_cb_mutex-31", "nlk_cb_mutex-MAX_LINKS" }; static int netlink_dump(struct sock *sk, bool lock_taken); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion * and removal are protected with per bucket lock while using RCU list * modification primitives and may run in parallel to RCU protected lookups. * Destruction of the Netlink socket may only occur *after* nl_table_lock has * been acquired * either during or after the socket has been removed from * the list and after an RCU grace period. */ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static BLOCKING_NOTIFIER_HEAD(netlink_chain); static const struct rhashtable_params netlink_rhashtable_params; void do_trace_netlink_extack(const char *msg) { trace_netlink_extack(msg); } EXPORT_SYMBOL(do_trace_netlink_extack); static inline u32 netlink_group_mask(u32 group) { if (group > 32) return 0; return group ? 1 << (group - 1) : 0; } static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { unsigned int len = skb->len; struct sk_buff *new; new = alloc_skb(len, gfp_mask); if (new == NULL) return NULL; NETLINK_CB(new).portid = NETLINK_CB(skb).portid; NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; NETLINK_CB(new).creds = NETLINK_CB(skb).creds; skb_put_data(new, skb->data, len); return new; } static unsigned int netlink_tap_net_id; struct netlink_tap_net { struct list_head netlink_tap_all; struct mutex netlink_tap_lock; }; int netlink_add_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; mutex_lock(&nn->netlink_tap_lock); list_add_rcu(&nt->list, &nn->netlink_tap_all); mutex_unlock(&nn->netlink_tap_lock); __module_get(nt->module); return 0; } EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { struct net *net = dev_net(nt->dev); struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); bool found = false; struct netlink_tap *tmp; mutex_lock(&nn->netlink_tap_lock); list_for_each_entry(tmp, &nn->netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; goto out; } } pr_warn("__netlink_remove_tap: %p not found\n", nt); out: mutex_unlock(&nn->netlink_tap_lock); if (found) module_put(nt->module); return found ? 0 : -ENODEV; } int netlink_remove_tap(struct netlink_tap *nt) { int ret; ret = __netlink_remove_tap(nt); synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(netlink_remove_tap); static __net_init int netlink_tap_init_net(struct net *net) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); INIT_LIST_HEAD(&nn->netlink_tap_all); mutex_init(&nn->netlink_tap_lock); return 0; } static struct pernet_operations netlink_tap_net_ops = { .init = netlink_tap_init_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), }; static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; /* We take the more conservative approach and * whitelist socket protocols that may pass. */ switch (sk->sk_protocol) { case NETLINK_ROUTE: case NETLINK_USERSOCK: case NETLINK_SOCK_DIAG: case NETLINK_NFLOG: case NETLINK_XFRM: case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: return true; } return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; struct sock *sk = skb->sk; int ret = -ENOMEM; if (!net_eq(dev_net(dev), sock_net(sk))) return 0; dev_hold(dev); if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? PACKET_KERNEL : PACKET_USER; skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); } dev_put(dev); return ret; } static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn) { int ret; struct netlink_tap *tmp; if (!netlink_filter_tap(skb)) return; list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } static void netlink_deliver_tap(struct net *net, struct sk_buff *skb) { struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id); rcu_read_lock(); if (unlikely(!list_empty(&nn->netlink_tap_all))) __netlink_deliver_tap(skb, nn); rcu_read_unlock(); } static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) netlink_deliver_tap(sock_net(dst), skb); } static void netlink_overrun(struct sock *sk) { if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { WRITE_ONCE(sk->sk_err, ENOBUFS); sk_error_report(sk); } } atomic_inc(&sk->sk_drops); } static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty_lockless(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) vfree_atomic(skb->head); skb->head = NULL; } if (skb->sk != NULL) sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); } skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(nlk_sk(sk)->groups); } static void netlink_sock_destruct_work(struct work_struct *work) { struct netlink_sock *nlk = container_of(work, struct netlink_sock, work); sk_free(&nlk->sk); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. */ void netlink_table_grab(void) __acquires(nl_table_lock) { might_sleep(); write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_irq(&nl_table_lock); schedule(); write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); remove_wait_queue(&nl_table_wait, &wait); } } void netlink_table_ungrab(void) __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } static inline void netlink_lock_table(void) { unsigned long flags; /* read_lock() synchronizes us to netlink_table_grab */ read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); read_unlock_irqrestore(&nl_table_lock, flags); } static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } struct netlink_compare_arg { possible_net_t pnet; u32 portid; }; /* Doing sizeof directly may yield 4 extra bytes on 64-bit. */ #define netlink_compare_arg_len \ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) static inline int netlink_compare(struct rhashtable_compare_arg *arg, const void *ptr) { const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } static void netlink_compare_arg_init(struct netlink_compare_arg *arg, struct net *net, u32 portid) { memset(arg, 0, sizeof(*arg)); write_pnet(&arg->pnet, net); arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, net, portid); return rhashtable_lookup_fast(&table->hash, &arg, netlink_rhashtable_params); } static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { struct netlink_table *table = &nl_table[protocol]; struct sock *sk; rcu_read_lock(); sk = __netlink_lookup(table, portid, net); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static const struct proto_ops netlink_ops; static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; unsigned long mask; unsigned int i; struct listeners *listeners; listeners = nl_deref_protected(tbl->listeners); if (!listeners) return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } static int netlink_insert(struct sock *sk, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; int err; lock_sock(sk); err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; if (nlk_sk(sk)->bound) goto err; /* portid can be read locklessly from netlink_getname(). */ WRITE_ONCE(nlk_sk(sk)->portid, portid); sock_hold(sk); err = __netlink_insert(table, sk); if (err) { /* In case the hashtable backend returns with -EBUSY * from here, it must not escape to the caller. */ if (unlikely(err == -EBUSY)) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); goto err; } /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); /* Paired with lockless reads from netlink_bind(), * netlink_connect() and netlink_sendmsg(). */ WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { struct netlink_table *table; table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, netlink_rhashtable_params)) { WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); netlink_update_listeners(sk); } if (sk->sk_protocol == NETLINK_GENERIC) atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } static struct proto netlink_proto = { .name = "NETLINK", .owner = THIS_MODULE, .obj_size = sizeof(struct netlink_sock), }; static int __netlink_create(struct net *net, struct socket *sock, struct mutex *dump_cb_mutex, int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); nlk = nlk_sk(sk); mutex_init(&nlk->nl_cb_mutex); lockdep_set_class_and_name(&nlk->nl_cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); nlk->dump_cb_mutex = dump_cb_mutex; init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; protocol = array_index_nospec(protocol, MAX_LINKS); netlink_lock_table(); #ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); } #endif if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) goto out; err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; sock_prot_inuse_add(net, &netlink_proto, 1); nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; nlk->netlink_release = release; out: return err; out_module: module_put(module); goto out; } static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; kfree(nlk->groups); nlk->groups = NULL; if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (nlk->cb_running && nlk->cb.done) { INIT_WORK(&nlk->work, netlink_sock_destruct_work); schedule_work(&nlk->work); return; } sk_free(sk); } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; netlink_remove(sk); sock_orphan(sk); nlk = nlk_sk(sk); /* * OK. Socket is unlinked, any packets that arrive now * will be purged. */ if (nlk->netlink_release) nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock */ if (nlk->netlink_unbind) { int i; for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) nlk->netlink_unbind(sock_net(sk), i + 1); } if (sk->sk_protocol == NETLINK_GENERIC && atomic_dec_return(&genl_sk_destructing_cnt) == 0) wake_up(&genl_sk_destructing_waitq); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, .portid = nlk->portid, }; blocking_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } module_put(nlk->module); if (netlink_is_kernel(sk)) { netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } netlink_table_ungrab(); } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); /* Because struct net might disappear soon, do not keep a pointer. */ if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); /* Because of deferred_put_nlk_sk and use of work queue, * it is possible netns will be freed before this socket. */ sock_net_set(sk, &init_net); __netns_tracker_alloc(&init_net, &sk->ns_tracker, false, GFP_KERNEL); } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; s32 rover = -4096; bool ok; retry: cond_resched(); rcu_read_lock(); ok = !__netlink_lookup(table, portid, net); rcu_read_unlock(); if (!ok) { /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; goto retry; } err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; /* If 2 threads race to autobind, that is fine. */ if (err == -EBUSY) err = 0; return err; } /** * __netlink_ns_capable - General netlink message capability test * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) { return ((nsp->flags & NETLINK_SKB_DST) || file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(__netlink_ns_capable); /** * netlink_ns_capable - General netlink message capability test * @skb: socket buffer holding a netlink command from userspace * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) { return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); } EXPORT_SYMBOL(netlink_ns_capable); /** * netlink_capable - Netlink global message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, &init_user_ns, cap); } EXPORT_SYMBOL(netlink_capable); /** * netlink_net_capable - Netlink network namespace message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has the capability @cap over the network namespace of * the socket we received the message from. */ bool netlink_net_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); } EXPORT_SYMBOL(netlink_net_capable); static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions) __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions) sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions; } static int netlink_realloc_groups(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; unsigned long *new_groups; int err = 0; netlink_table_grab(); groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) { err = -ENOENT; goto out_unlock; } if (nlk->ngroups >= groups) goto out_unlock; new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); if (new_groups == NULL) { err = -ENOMEM; goto out_unlock; } memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; nlk->ngroups = groups; out_unlock: netlink_table_ungrab(); return err; } static void netlink_undo_bind(int group, long unsigned int groups, struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) return; for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; unsigned long groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; groups = nladdr->nl_groups; /* Only superuser is allowed to listen multicasts */ if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; /* Paired with WRITE_ONCE() in netlink_insert() */ bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); if (nladdr->nl_pid != nlk->portid) return -EINVAL; } if (nlk->netlink_bind && groups) { int group; /* nl_groups is a u32, so cap the maximum groups we can bind */ for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); return err; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ netlink_lock_table(); if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) goto unlock; netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + hweight32(groups) - hweight32(nlk->groups[0])); nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0; unlock: netlink_unlock_table(); return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (alen < sizeof(addr->sa_family)) return -EINVAL; if (addr->sa_family == AF_UNSPEC) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, 0); WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; if (alen < sizeof(struct sockaddr_nl)) return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; /* No need for barriers here as we return to user-space without * using any of the bound attributes. * Paired with WRITE_ONCE() in netlink_insert(). */ if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; } static int netlink_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; if (peer) { /* Paired with WRITE_ONCE() in netlink_connect() */ nladdr->nl_pid = READ_ONCE(nlk->dst_portid); nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { /* Paired with WRITE_ONCE() in netlink_insert() */ nladdr->nl_pid = READ_ONCE(nlk->portid); netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* try to hand this ioctl down to the NIC drivers. */ return -ENOIOCTLCMD; } static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); /* dst_portid and sk_state can be changed in netlink_connect() */ if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } return sock; } struct sock *netlink_getsockbyfilp(struct file *filp) { struct inode *inode = file_inode(filp); struct sock *sock; if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); sock_hold(sock); return sock; } struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { size_t head_size = SKB_HEAD_ALIGN(size); struct sk_buff *skb; void *data; if (head_size <= PAGE_SIZE || broadcast) return alloc_skb(size, GFP_KERNEL); data = kvmalloc(head_size, GFP_KERNEL); if (!data) return NULL; skb = __build_skb(data, head_size); if (!skb) kvfree(data); else if (is_vmalloc_addr(data)) skb->destructor = netlink_skb_destructor; return skb; } /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not send to the destination, just all * all error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. */ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { struct netlink_sock *nlk; nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; } __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) { kfree_skb(skb); return sock_intr_errno(*timeo); } return 1; } netlink_skb_set_owner_r(skb, sk); return 0; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; netlink_deliver_tap(sock_net(sk), skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = __netlink_sendskb(sk, skb); sock_put(sk); return len; } void netlink_detachskb(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); sock_put(sk); } static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; consume_skb(skb); skb = nskb; } pskb_expand_head(skb, 0, -delta, (allocation & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); return skb; } static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { kfree_skb(skb); } sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock) { struct sock *sk; int err; long timeo; skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; kfree_skb(skb); sock_put(sk); return err; } err = netlink_attachskb(sk, skb, &timeo, ssk); if (err == 1) goto retry; if (err) return err; return netlink_sendskb(sk, skb); } EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(netlink_has_listeners); bool netlink_strict_get_check(struct sk_buff *skb) { return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); } EXPORT_SYMBOL_GPL(netlink_strict_get_check); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } return -1; } struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; u32 portid; u32 group; int failure; int delivery_failure; int congested; int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); void *tx_data; }; static void do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) return; if (!net_eq(sock_net(sk), p->net)) { if (!nlk_test_bit(LISTEN_ALL_NSID, sk)) return; if (!peernet_has_id(sock_net(sk), p->net)) return; if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, CAP_NET_BROADCAST)) return; } if (p->failure) { netlink_overrun(sk); return; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) { netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; goto out; } if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); if (nlk_test_bit(BROADCAST_SEND_ERROR, sk)) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } out: sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; info.tx_filter = filter; info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); netlink_unlock_table(); if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; } consume_skb(info.skb2); if (info.delivered) { if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast_filtered); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; u32 portid; u32 group; int code; }; static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; if (sk == p->exclude_sk) goto out; if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) { ret = 1; goto out; } WRITE_ONCE(sk->sk_err, p->code); sk_error_report(sk); out: return ret; } /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; unsigned long flags; struct sock *sk; int ret = 0; info.exclude_sk = ssk; info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock_irqsave(&nl_table_lock, flags); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock_irqrestore(&nl_table_lock, flags); return ret; } EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, unsigned int group, int is_new) { int old, new = !!is_new, subscriptions; old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } static int netlink_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; int nr = -1; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (optlen >= sizeof(int) && copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: nr = NETLINK_F_RECV_PKTINFO; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { int err; if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); break; } case NETLINK_BROADCAST_ERROR: nr = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val); if (val) { clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; nr = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: nr = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: nr = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: nr = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (nr >= 0) assign_bit(nr, &nlk->flags, val); return 0; } static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int flag; int len, val; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO: flag = NETLINK_F_RECV_PKTINFO; break; case NETLINK_BROADCAST_ERROR: flag = NETLINK_F_BROADCAST_SEND_ERROR; break; case NETLINK_NO_ENOBUFS: flag = NETLINK_F_RECV_NO_ENOBUFS; break; case NETLINK_LIST_MEMBERSHIPS: { int pos, idx, shift, err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; idx = pos / sizeof(unsigned long); shift = (pos % sizeof(unsigned long)) * 8; if (put_user((u32)(nlk->groups[idx] >> shift), (u32 __user *)(optval + pos))) { err = -EFAULT; break; } } if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); return err; } case NETLINK_LISTEN_ALL_NSID: flag = NETLINK_F_LISTEN_ALL_NSID; break; case NETLINK_CAP_ACK: flag = NETLINK_F_CAP_ACK; break; case NETLINK_EXT_ACK: flag = NETLINK_F_EXT_ACK; break; case NETLINK_GET_STRICT_CHK: flag = NETLINK_F_STRICT_CHK; break; default: return -ENOPROTOOPT; } if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = test_bit(flag, &nlk->flags); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct nl_pktinfo info; info.group = NETLINK_CB(skb).dst_group; put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { if (!NETLINK_CB(skb).nsid_is_set) return; put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), &NETLINK_CB(skb).nsid); } static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; u32 netlink_skb_flags = 0; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (len == 0) { pr_warn_once("Zero length message leads to an empty skb\n"); return -ENODATA; } err = scm_send(sock, msg, &scm, true); if (err < 0) return err; if (msg->msg_namelen) { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_nl)) goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { /* Paired with WRITE_ONCE() in netlink_connect() */ dst_portid = READ_ONCE(nlk->dst_portid); dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; } else { /* Ensure nlk is hashed and visible. */ smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; } if (dst_group) { refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT); out: scm_destroy(&scm); return err; } static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; if (flags & MSG_OOB) return -EOPNOTSUPP; copied = 0; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; data_skb = skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { /* * If this skb has a frag_list, then here that means that we * will have to use the frag_list skb's data for compat tasks * and the regular skb's data for normal (non-compat) tasks. * * If we need to send the compat skb, assign it to the * 'data_skb' variable so that it will be used below for data * copying. We keep 'skb' for everything else, including * freeing both later. */ if (flags & MSG_CMSG_COMPAT) data_skb = skb_shinfo(skb)->frag_list; } #endif /* Record the max length of recvmsg() calls for future allocations */ max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); max_recvmsg_len = min_t(size_t, max_recvmsg_len, SKB_WITH_OVERHEAD(32768)); WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } if (nlk_test_bit(RECV_PKTINFO, sk)) netlink_cmsg_recv_pktinfo(msg, skb); if (nlk_test_bit(LISTEN_ALL_NSID, sk)) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk, false); if (ret) { WRITE_ONCE(sk->sk_err, -ret); sk_error_report(sk); } } scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } static void netlink_data_ready(struct sock *sk) { BUG(); } /* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. */ struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL; unsigned int groups; BUG_ON(!nl_table); if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; if (!cfg || cfg->groups < 32) groups = 32; else groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags); netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; } else { kfree(listeners); nl_table[unit].registered++; } netlink_table_ungrab(); return sk; out_sock_release: kfree(listeners); netlink_kernel_release(sk); return NULL; out_sock_release_nosk: sock_release(sock); return NULL; } EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { if (sk == NULL || sk->sk_socket == NULL) return; sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); kfree_rcu(old, rcu); } tbl->groups = groups; return 0; } /** * netlink_change_ngroups - change number of multicast groups * * This changes the number of multicast groups that are available * on a certain netlink family. Note that it is not possible to * change the number of groups to below 32. Also note that it does * not implicitly call netlink_clear_multicast_users() when the * number of groups is reduced. * * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). * @groups: The new number of groups. */ int netlink_change_ngroups(struct sock *sk, unsigned int groups) { int err; netlink_table_grab(); err = __netlink_change_ngroups(sk, groups); netlink_table_ungrab(); return err; } void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; sk_for_each_bound(sk, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); nlh = skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); static size_t netlink_ack_tlv_len(struct netlink_sock *nlk, int err, const struct netlink_ext_ack *extack) { size_t tlvlen; if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags)) return 0; tlvlen = 0; if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); /* Following attributes are only reported as error (not warning) */ if (!err) return tlvlen; if (extack->bad_attr) tlvlen += nla_total_size(sizeof(u32)); if (extack->policy) tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (extack->miss_type) tlvlen += nla_total_size(sizeof(u32)); if (extack->miss_nest) tlvlen += nla_total_size(sizeof(u32)); return tlvlen; } static void netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, const struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); if (!err) return; if (extack->bad_attr && !WARN_ON((u8 *)extack->bad_attr < in_skb->data || (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (const u8 *)nlh)); if (extack->policy) netlink_policy_dump_write_attr(skb, extack->policy, NLMSGERR_ATTR_POLICY); if (extack->miss_type) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, extack->miss_type)); if (extack->miss_nest && !WARN_ON((u8 *)extack->miss_nest < in_skb->data || (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, (u8 *)extack->miss_nest - (const u8 *)nlh)); } /* * It looks a bit ugly. * It would be better to create kernel thread. */ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, struct netlink_callback *cb, struct netlink_ext_ack *extack) { struct nlmsghdr *nlh; size_t extack_len; nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI | cb->answer_flags); if (WARN_ON(!nlh)) return -ENOBUFS; nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); extack_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack); if (extack_len) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (skb_tailroom(skb) >= extack_len) { netlink_ack_tlv_fill(cb->skb, skb, cb->nlh, nlk->dump_done_errno, extack); nlmsg_end(skb, nlh); } } return 0; } static int netlink_dump(struct sock *sk, bool lock_taken) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; if (!lock_taken) mutex_lock(&nlk->nl_cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 16K bytes allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); if (alloc_min_size < max_recvmsg_len) { alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the * requested size with align to next power of 2 approach). Allowing * dump to use the excess space makes it difficult for a user to have a * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); /* Make sure malicious BPF programs can not read unitialized memory * from skb->head -> skb->data */ skb_reset_network_header(skb); skb_reset_mac_header(skb); netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) { struct mutex *extra_mutex = nlk->dump_cb_mutex; cb->extack = &extack; if (cb->flags & RTNL_FLAG_DUMP_UNLOCKED) extra_mutex = NULL; if (extra_mutex) mutex_lock(extra_mutex); nlk->dump_done_errno = cb->dump(skb, cb); if (extra_mutex) mutex_unlock(extra_mutex); /* EMSGSIZE plus something already in the skb means * that there's more to dump but current skb has filled up. * If the callback really wants to return EMSGSIZE to user space * it needs to do so again, on the next cb->dump() call, * without putting data in the skb. */ if (nlk->dump_done_errno == -EMSGSIZE && skb->len) nlk->dump_done_errno = skb->len; cb->extack = NULL; } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(&nlk->nl_cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); return 0; } if (netlink_dump_done(nlk, skb, cb, &extack)) goto errout_skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES /* frag_list skb's data is used for compat tasks * and the regular skb's data for normal (non-compat) tasks. * See netlink_recvmsg(). */ if (unlikely(skb_shinfo(skb)->frag_list)) { if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack)) goto errout_skb; } #endif if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(&nlk->nl_cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: mutex_unlock(&nlk->nl_cb_mutex); kfree_skb(skb); return err; } int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { struct netlink_callback *cb; struct netlink_sock *nlk; struct sock *sk; int ret; refcount_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { ret = -ECONNREFUSED; goto error_free; } nlk = nlk_sk(sk); mutex_lock(&nlk->nl_cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; goto error_unlock; } /* add reference of module which cb->dump belongs to */ if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; goto error_unlock; } cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; cb->flags = control->flags; cb->skb = skb; cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk); if (control->start) { cb->extack = control->extack; ret = control->start(cb); cb->extack = NULL; if (ret) goto error_put; } WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; ret = netlink_dump(sk, true); sock_put(sk); if (ret) return ret; /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ return -EINTR; error_put: module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(&nlk->nl_cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; size_t tlvlen; /* Error messages get the original request appened, unless the user * requests to cap the error message, and get extra error data if * requested. */ if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; tlvlen = netlink_ack_tlv_len(nlk, err, extack); if (tlvlen) flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) goto err_skb; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, sizeof(*errmsg), flags); if (!rep) goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; errmsg->msg = *nlh; if (!(flags & NLM_F_CAPPED)) { if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); return; err_bad_put: nlmsg_free(skb); err_skb: WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS); sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { int exclude_portid = 0; if (report) { refcount_inc(&skb->users); exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); if (err == -ESRCH) err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); if (!err) err = err2; } return err; } EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; struct rhashtable_iter hti; int link; }; static void netlink_walk_start(struct nl_seq_iter *iter) { rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti); rhashtable_walk_start(&iter->hti); } static void netlink_walk_stop(struct nl_seq_iter *iter) { rhashtable_walk_stop(&iter->hti); rhashtable_walk_exit(&iter->hti); } static void *__netlink_seq_next(struct seq_file *seq) { struct nl_seq_iter *iter = seq->private; struct netlink_sock *nlk; do { for (;;) { nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { if (PTR_ERR(nlk) == -EAGAIN) continue; return nlk; } if (nlk) break; netlink_walk_stop(iter); if (++iter->link >= MAX_LINKS) return NULL; netlink_walk_start(iter); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); return nlk; } static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) __acquires(RCU) { struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; iter->link = 0; netlink_walk_start(iter); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); return obj; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return __netlink_seq_next(seq); } static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; if (iter->link >= MAX_LINKS) return; netlink_walk_stop(iter); } static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) ); } return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__netlink { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct netlink_sock *, sk); }; DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) static int netlink_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__netlink ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.sk = nlk_sk((struct sock *)v); return bpf_iter_run_prog(prog, &ctx); } static int netlink_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return netlink_native_seq_show(seq, v); if (v != SEQ_START_TOKEN) return netlink_prog_seq_show(prog, &meta, v); return 0; } static void netlink_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)netlink_prog_seq_show(prog, &meta, v); } netlink_native_seq_stop(seq, v); } #else static int netlink_seq_show(struct seq_file *seq, void *v) { return netlink_native_seq_show(seq, v); } static void netlink_seq_stop(struct seq_file *seq, void *v) { netlink_native_seq_stop(seq, v); } #endif static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, .stop = netlink_seq_stop, .show = netlink_seq_show, }; #endif int netlink_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind = netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, }; static const struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */ }; static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops, sizeof(struct nl_seq_iter))) return -ENOMEM; #endif return 0; } static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("netlink", net->proc_net); #endif } static void __init netlink_add_usersock_entry(void) { struct listeners *listeners; int groups = 32; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) { const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } static const struct rhashtable_params netlink_rhashtable_params = { .head_offset = offsetof(struct netlink_sock, node), .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, .automatic_shrinking = true, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) BTF_ID_LIST(btf_netlink_sock_id) BTF_ID(struct, netlink_sock) static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), }; static struct bpf_iter_reg netlink_reg_info = { .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) { netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id; return bpf_iter_reg_target(&netlink_reg_info); } #endif static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); if (err != 0) goto out; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) err = bpf_iter_register(); if (err) goto out; #endif BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) goto panic; for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, &netlink_rhashtable_params) < 0) { while (--i > 0) rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); goto panic; } } netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); register_pernet_subsys(&netlink_tap_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: return err; panic: panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init);
16 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") ); TRACE_EVENT(mm_lru_activate, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); ), TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
231 245 7 170 246 5 154 223 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H #define __LINUX_INSIDE_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations * * here's the role of the various spinlock/rwlock related include files: * * on SMP builds: * * asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the * initializers * * linux/spinlock_types_raw: * The raw types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * asm/spinlock.h: contains the arch_spin_*()/etc. lowlevel * implementations, mostly inline assembly code * * (also included on UP-debug builds:) * * linux/spinlock_api_smp.h: * contains the prototypes for the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. * * on UP builds: * * linux/spinlock_type_up.h: * contains the generic, simplified UP spinlock type. * (which is an empty structure on non-debug builds) * * linux/spinlock_types_raw: * The raw RT types and initializers * linux/spinlock_types.h: * defines the generic type and initializers * * linux/spinlock_up.h: * contains the arch_spin_*()/etc. version of UP * builds. (which are NOPs on non-debug, non-preempt * builds) * * (included on UP-non-debug builds:) * * linux/spinlock_api_up.h: * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. */ #include <linux/typecheck.h> #include <linux/preempt.h> #include <linux/linkage.h> #include <linux/compiler.h> #include <linux/irqflags.h> #include <linux/thread_info.h> #include <linux/stringify.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/barrier.h> #include <asm/mmiowb.h> /* * Must define these before including other files, inline functions need them */ #define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME #define LOCK_SECTION_START(extra) \ ".subsection 1\n\t" \ extra \ ".ifndef " LOCK_SECTION_NAME "\n\t" \ LOCK_SECTION_NAME ":\n\t" \ ".endif\n" #define LOCK_SECTION_END \ ".previous\n\t" #define __lockfunc __section(".spinlock.text") /* * Pull the arch_spinlock_t and arch_rwlock_t definitions: */ #include <linux/spinlock_types.h> /* * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them): */ #ifdef CONFIG_SMP # include <asm/spinlock.h> #else # include <linux/spinlock_up.h> #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner); # define raw_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN); \ } while (0) #else # define raw_spin_lock_init(lock) \ do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0) #endif #define raw_spin_is_locked(lock) arch_spin_is_locked(&(lock)->raw_lock) #ifdef arch_spin_is_contended #define raw_spin_is_contended(lock) arch_spin_is_contended(&(lock)->raw_lock) #else #define raw_spin_is_contended(lock) (((void)(lock), 0)) #endif /*arch_spin_is_contended*/ /* * smp_mb__after_spinlock() provides the equivalent of a full memory barrier * between program-order earlier lock acquisitions and program-order later * memory accesses. * * This guarantees that the following two properties hold: * * 1) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 * * WRITE_ONCE(X, 1); WRITE_ONCE(Y, 1); * spin_lock(S); smp_mb(); * smp_mb__after_spinlock(); r1 = READ_ONCE(X); * r0 = READ_ONCE(Y); * spin_unlock(S); * * it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0) * and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments * preceding the call to smp_mb__after_spinlock() in __schedule() and in * try_to_wake_up(). * * 2) Given the snippet: * * { X = 0; Y = 0; } * * CPU0 CPU1 CPU2 * * spin_lock(S); spin_lock(S); r1 = READ_ONCE(Y); * WRITE_ONCE(X, 1); smp_mb__after_spinlock(); smp_rmb(); * spin_unlock(S); r0 = READ_ONCE(X); r2 = READ_ONCE(X); * WRITE_ONCE(Y, 1); * spin_unlock(S); * * it is forbidden that CPU0's critical section executes before CPU1's * critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1) * and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments * preceding the calls to smp_rmb() in try_to_wake_up() for similar * snippets but "projected" onto two CPUs. * * Property (2) upgrades the lock to an RCsc lock. * * Since most load-store architectures implement ACQUIRE with an smp_mb() after * the LL/SC loop, they need no further barriers. Similarly all our TSO * architectures imply an smp_mb() for each atomic instruction and equally don't * need more. * * Architectures that can implement ACQUIRE better need to take care. */ #ifndef smp_mb__after_spinlock #define smp_mb__after_spinlock() kcsan_mb() #endif #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); extern int do_raw_spin_trylock(raw_spinlock_t *lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) { __acquire(lock); arch_spin_lock(&lock->raw_lock); mmiowb_spin_lock(); } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); if (ret) mmiowb_spin_lock(); return ret; } static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) { mmiowb_spin_unlock(); arch_spin_unlock(&lock->raw_lock); __release(lock); } #endif /* * Define the various spin_lock methods. Note we define these * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The * various methods are defined as nops in the case they are not * required. */ #define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) #define raw_spin_lock(lock) _raw_spin_lock(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\ _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else /* * Always evaluate the 'subclass' argument to avoid that the compiler * warns about set-but-not-used variables when building with * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1. */ # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave_nested(lock, subclass); \ } while (0) #else #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ typecheck(unsigned long, flags); \ flags = _raw_spin_lock_irqsave(lock); \ } while (0) #endif #else #define raw_spin_lock_irqsave(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_lock_irqsave(lock, flags); \ } while (0) #define raw_spin_lock_irqsave_nested(lock, flags, subclass) \ raw_spin_lock_irqsave(lock, flags) #endif #define raw_spin_lock_irq(lock) _raw_spin_lock_irq(lock) #define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock) #define raw_spin_unlock(lock) _raw_spin_unlock(lock) #define raw_spin_unlock_irq(lock) _raw_spin_unlock_irq(lock) #define raw_spin_unlock_irqrestore(lock, flags) \ do { \ typecheck(unsigned long, flags); \ _raw_spin_unlock_irqrestore(lock, flags); \ } while (0) #define raw_spin_unlock_bh(lock) _raw_spin_unlock_bh(lock) #define raw_spin_trylock_bh(lock) \ __cond_lock(lock, _raw_spin_trylock_bh(lock)) #define raw_spin_trylock_irq(lock) \ ({ \ local_irq_disable(); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_enable(); 0; }); \ }) #define raw_spin_trylock_irqsave(lock, flags) \ ({ \ local_irq_save(flags); \ raw_spin_trylock(lock) ? \ 1 : ({ local_irq_restore(flags); 0; }); \ }) #ifndef CONFIG_PREEMPT_RT /* Include rwlock functions for !RT */ #include <linux/rwlock.h> #endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) # include <linux/spinlock_api_smp.h> #else # include <linux/spinlock_api_up.h> #endif /* Non PREEMPT_RT kernel, map to raw spinlocks: */ #ifndef CONFIG_PREEMPT_RT /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) { return &lock->rlock; } #ifdef CONFIG_DEBUG_SPINLOCK # define spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ __raw_spin_lock_init(spinlock_check(lock), \ #lock, &__key, LD_WAIT_CONFIG); \ } while (0) #else # define spin_lock_init(_lock) \ do { \ spinlock_check(_lock); \ *(_lock) = __SPIN_LOCK_UNLOCKED(_lock); \ } while (0) #endif static __always_inline void spin_lock(spinlock_t *lock) { raw_spin_lock(&lock->rlock); } static __always_inline void spin_lock_bh(spinlock_t *lock) { raw_spin_lock_bh(&lock->rlock); } static __always_inline int spin_trylock(spinlock_t *lock) { return raw_spin_trylock(&lock->rlock); } #define spin_lock_nested(lock, subclass) \ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ } while (0) static __always_inline void spin_lock_irq(spinlock_t *lock) { raw_spin_lock_irq(&lock->rlock); } #define spin_lock_irqsave(lock, flags) \ do { \ raw_spin_lock_irqsave(spinlock_check(lock), flags); \ } while (0) #define spin_lock_irqsave_nested(lock, flags, subclass) \ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ } while (0) static __always_inline void spin_unlock(spinlock_t *lock) { raw_spin_unlock(&lock->rlock); } static __always_inline void spin_unlock_bh(spinlock_t *lock) { raw_spin_unlock_bh(&lock->rlock); } static __always_inline void spin_unlock_irq(spinlock_t *lock) { raw_spin_unlock_irq(&lock->rlock); } static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { raw_spin_unlock_irqrestore(&lock->rlock, flags); } static __always_inline int spin_trylock_bh(spinlock_t *lock) { return raw_spin_trylock_bh(&lock->rlock); } static __always_inline int spin_trylock_irq(spinlock_t *lock) { return raw_spin_trylock_irq(&lock->rlock); } #define spin_trylock_irqsave(lock, flags) \ ({ \ raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) /** * spin_is_locked() - Check whether a spinlock is locked. * @lock: Pointer to the spinlock. * * This function is NOT required to provide any memory ordering * guarantees; it could be used for debugging purposes or, when * additional synchronization is needed, accompanied with other * constructs (memory barriers) enforcing the synchronization. * * Returns: 1 if @lock is locked, 0 otherwise. * * Note that the function only tells you that the spinlock is * seen to be locked, not that it is locked on your CPU. * * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n, * the return value is always 0 (see include/linux/spinlock_up.h). * Therefore you should not rely heavily on the return value. */ static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); } static __always_inline int spin_is_contended(spinlock_t *lock) { return raw_spin_is_contended(&lock->rlock); } #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) #else /* !CONFIG_PREEMPT_RT */ # include <linux/spinlock_rt.h> #endif /* CONFIG_PREEMPT_RT */ /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { #ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; #endif } /* * Check if a rwlock is contended. * Returns non-zero if there is another task waiting on the rwlock. * Returns zero if the lock is not contended or the system / underlying * rwlock implementation does not support contention detection. * Technically does not depend on CONFIG_PREEMPTION, but a general need * for low latency. */ static inline int rwlock_needbreak(rwlock_t *lock) { #ifdef CONFIG_PREEMPTION return rwlock_is_contended(lock); #else return 0; #endif } /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) */ #include <linux/atomic.h> /** * atomic_dec_and_lock - lock on reaching reference count zero * @atomic: the atomic counter * @lock: the spinlock in question * * Decrements @atomic by 1. If the result is 0, returns true and locks * @lock. Returns false for all other cases. */ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, unsigned long *flags); #define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock); #define atomic_dec_and_raw_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock)) extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock, unsigned long *flags); #define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \ __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags))) int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp, const char *name, struct lock_class_key *key); #define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp) \ ({ \ static struct lock_class_key key; \ int ret; \ \ ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size, \ cpu_mult, gfp, #locks, &key); \ ret; \ }) void free_bucket_spinlocks(spinlock_t *locks); DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t, raw_spin_lock(_T->lock), raw_spin_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock)) DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t, raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING), raw_spin_unlock(_T->lock)) DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t, raw_spin_lock_irq(_T->lock), raw_spin_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t, raw_spin_lock_irqsave(_T->lock, _T->flags), raw_spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try, raw_spin_trylock_irqsave(_T->lock, _T->flags)) DEFINE_LOCK_GUARD_1(spinlock, spinlock_t, spin_lock(_T->lock), spin_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock)) DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t, spin_lock_irq(_T->lock), spin_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try, spin_trylock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, spin_lock_irqsave(_T->lock, _T->flags), spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, spin_trylock_irqsave(_T->lock, _T->flags)) DEFINE_LOCK_GUARD_1(read_lock, rwlock_t, read_lock(_T->lock), read_unlock(_T->lock)) DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t, read_lock_irq(_T->lock), read_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t, read_lock_irqsave(_T->lock, _T->flags), read_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) DEFINE_LOCK_GUARD_1(write_lock, rwlock_t, write_lock(_T->lock), write_unlock(_T->lock)) DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t, write_lock_irq(_T->lock), write_unlock_irq(_T->lock)) DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t, write_lock_irqsave(_T->lock, _T->flags), write_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 // SPDX-License-Identifier: GPL-2.0-only /* * A driver for the Griffin Technology, Inc. "PowerMate" USB controller dial. * * v1.1, (c)2002 William R Sowerbutts <will@sowerbutts.com> * * This device is a anodised aluminium knob which connects over USB. It can measure * clockwise and anticlockwise rotation. The dial also acts as a pushbutton with * a spring for automatic release. The base contains a pair of LEDs which illuminate * the translucent base. It rotates without limit and reports its relative rotation * back to the host when polled by the USB controller. * * Testing with the knob I have has shown that it measures approximately 94 "clicks" * for one full rotation. Testing with my High Speed Rotation Actuator (ok, it was * a variable speed cordless electric drill) has shown that the device can measure * speeds of up to 7 clicks either clockwise or anticlockwise between pollings from * the host. If it counts more than 7 clicks before it is polled, it will wrap back * to zero and start counting again. This was at quite high speed, however, almost * certainly faster than the human hand could turn it. Griffin say that it loses a * pulse or two on a direction change; the granularity is so fine that I never * noticed this in practice. * * The device's microcontroller can be programmed to set the LED to either a constant * intensity, or to a rhythmic pulsing. Several patterns and speeds are available. * * Griffin were very happy to provide documentation and free hardware for development. * * Some userspace tools are available on the web: http://sowerbutts.com/powermate/ * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/usb/input.h> #define POWERMATE_VENDOR 0x077d /* Griffin Technology, Inc. */ #define POWERMATE_PRODUCT_NEW 0x0410 /* Griffin PowerMate */ #define POWERMATE_PRODUCT_OLD 0x04AA /* Griffin soundKnob */ #define CONTOUR_VENDOR 0x05f3 /* Contour Design, Inc. */ #define CONTOUR_JOG 0x0240 /* Jog and Shuttle */ /* these are the command codes we send to the device */ #define SET_STATIC_BRIGHTNESS 0x01 #define SET_PULSE_ASLEEP 0x02 #define SET_PULSE_AWAKE 0x03 #define SET_PULSE_MODE 0x04 /* these refer to bits in the powermate_device's requires_update field. */ #define UPDATE_STATIC_BRIGHTNESS (1<<0) #define UPDATE_PULSE_ASLEEP (1<<1) #define UPDATE_PULSE_AWAKE (1<<2) #define UPDATE_PULSE_MODE (1<<3) /* at least two versions of the hardware exist, with differing payload sizes. the first three bytes always contain the "interesting" data in the relevant format. */ #define POWERMATE_PAYLOAD_SIZE_MAX 6 #define POWERMATE_PAYLOAD_SIZE_MIN 3 struct powermate_device { signed char *data; dma_addr_t data_dma; struct urb *irq, *config; struct usb_ctrlrequest *configcr; struct usb_device *udev; struct usb_interface *intf; struct input_dev *input; spinlock_t lock; int static_brightness; int pulse_speed; int pulse_table; int pulse_asleep; int pulse_awake; int requires_update; // physical settings which are out of sync char phys[64]; }; static char pm_name_powermate[] = "Griffin PowerMate"; static char pm_name_soundknob[] = "Griffin SoundKnob"; static void powermate_config_complete(struct urb *urb); /* Callback for data arriving from the PowerMate over the USB interrupt pipe */ static void powermate_irq(struct urb *urb) { struct powermate_device *pm = urb->context; struct device *dev = &pm->intf->dev; int retval; switch (urb->status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, urb->status); return; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, urb->status); goto exit; } /* handle updates to device state */ input_report_key(pm->input, BTN_0, pm->data[0] & 0x01); input_report_rel(pm->input, REL_DIAL, pm->data[1]); input_sync(pm->input); exit: retval = usb_submit_urb (urb, GFP_ATOMIC); if (retval) dev_err(dev, "%s - usb_submit_urb failed with result: %d\n", __func__, retval); } /* Decide if we need to issue a control message and do so. Must be called with pm->lock taken */ static void powermate_sync_state(struct powermate_device *pm) { if (pm->requires_update == 0) return; /* no updates are required */ if (pm->config->status == -EINPROGRESS) return; /* an update is already in progress; it'll issue this update when it completes */ if (pm->requires_update & UPDATE_PULSE_ASLEEP){ pm->configcr->wValue = cpu_to_le16( SET_PULSE_ASLEEP ); pm->configcr->wIndex = cpu_to_le16( pm->pulse_asleep ? 1 : 0 ); pm->requires_update &= ~UPDATE_PULSE_ASLEEP; }else if (pm->requires_update & UPDATE_PULSE_AWAKE){ pm->configcr->wValue = cpu_to_le16( SET_PULSE_AWAKE ); pm->configcr->wIndex = cpu_to_le16( pm->pulse_awake ? 1 : 0 ); pm->requires_update &= ~UPDATE_PULSE_AWAKE; }else if (pm->requires_update & UPDATE_PULSE_MODE){ int op, arg; /* the powermate takes an operation and an argument for its pulse algorithm. the operation can be: 0: divide the speed 1: pulse at normal speed 2: multiply the speed the argument only has an effect for operations 0 and 2, and ranges between 1 (least effect) to 255 (maximum effect). thus, several states are equivalent and are coalesced into one state. we map this onto a range from 0 to 510, with: 0 -- 254 -- use divide (0 = slowest) 255 -- use normal speed 256 -- 510 -- use multiple (510 = fastest). Only values of 'arg' quite close to 255 are particularly useful/spectacular. */ if (pm->pulse_speed < 255) { op = 0; // divide arg = 255 - pm->pulse_speed; } else if (pm->pulse_speed > 255) { op = 2; // multiply arg = pm->pulse_speed - 255; } else { op = 1; // normal speed arg = 0; // can be any value } pm->configcr->wValue = cpu_to_le16( (pm->pulse_table << 8) | SET_PULSE_MODE ); pm->configcr->wIndex = cpu_to_le16( (arg << 8) | op ); pm->requires_update &= ~UPDATE_PULSE_MODE; } else if (pm->requires_update & UPDATE_STATIC_BRIGHTNESS) { pm->configcr->wValue = cpu_to_le16( SET_STATIC_BRIGHTNESS ); pm->configcr->wIndex = cpu_to_le16( pm->static_brightness ); pm->requires_update &= ~UPDATE_STATIC_BRIGHTNESS; } else { printk(KERN_ERR "powermate: unknown update required"); pm->requires_update = 0; /* fudge the bug */ return; } /* printk("powermate: %04x %04x\n", pm->configcr->wValue, pm->configcr->wIndex); */ pm->configcr->bRequestType = 0x41; /* vendor request */ pm->configcr->bRequest = 0x01; pm->configcr->wLength = 0; usb_fill_control_urb(pm->config, pm->udev, usb_sndctrlpipe(pm->udev, 0), (void *) pm->configcr, NULL, 0, powermate_config_complete, pm); if (usb_submit_urb(pm->config, GFP_ATOMIC)) printk(KERN_ERR "powermate: usb_submit_urb(config) failed"); } /* Called when our asynchronous control message completes. We may need to issue another immediately */ static void powermate_config_complete(struct urb *urb) { struct powermate_device *pm = urb->context; unsigned long flags; if (urb->status) printk(KERN_ERR "powermate: config urb returned %d\n", urb->status); spin_lock_irqsave(&pm->lock, flags); powermate_sync_state(pm); spin_unlock_irqrestore(&pm->lock, flags); } /* Set the LED up as described and begin the sync with the hardware if required */ static void powermate_pulse_led(struct powermate_device *pm, int static_brightness, int pulse_speed, int pulse_table, int pulse_asleep, int pulse_awake) { unsigned long flags; if (pulse_speed < 0) pulse_speed = 0; if (pulse_table < 0) pulse_table = 0; if (pulse_speed > 510) pulse_speed = 510; if (pulse_table > 2) pulse_table = 2; pulse_asleep = !!pulse_asleep; pulse_awake = !!pulse_awake; spin_lock_irqsave(&pm->lock, flags); /* mark state updates which are required */ if (static_brightness != pm->static_brightness) { pm->static_brightness = static_brightness; pm->requires_update |= UPDATE_STATIC_BRIGHTNESS; } if (pulse_asleep != pm->pulse_asleep) { pm->pulse_asleep = pulse_asleep; pm->requires_update |= (UPDATE_PULSE_ASLEEP | UPDATE_STATIC_BRIGHTNESS); } if (pulse_awake != pm->pulse_awake) { pm->pulse_awake = pulse_awake; pm->requires_update |= (UPDATE_PULSE_AWAKE | UPDATE_STATIC_BRIGHTNESS); } if (pulse_speed != pm->pulse_speed || pulse_table != pm->pulse_table) { pm->pulse_speed = pulse_speed; pm->pulse_table = pulse_table; pm->requires_update |= UPDATE_PULSE_MODE; } powermate_sync_state(pm); spin_unlock_irqrestore(&pm->lock, flags); } /* Callback from the Input layer when an event arrives from userspace to configure the LED */ static int powermate_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int _value) { unsigned int command = (unsigned int)_value; struct powermate_device *pm = input_get_drvdata(dev); if (type == EV_MSC && code == MSC_PULSELED){ /* bits 0- 7: 8 bits: LED brightness bits 8-16: 9 bits: pulsing speed modifier (0 ... 510); 0-254 = slower, 255 = standard, 256-510 = faster. bits 17-18: 2 bits: pulse table (0, 1, 2 valid) bit 19: 1 bit : pulse whilst asleep? bit 20: 1 bit : pulse constantly? */ int static_brightness = command & 0xFF; // bits 0-7 int pulse_speed = (command >> 8) & 0x1FF; // bits 8-16 int pulse_table = (command >> 17) & 0x3; // bits 17-18 int pulse_asleep = (command >> 19) & 0x1; // bit 19 int pulse_awake = (command >> 20) & 0x1; // bit 20 powermate_pulse_led(pm, static_brightness, pulse_speed, pulse_table, pulse_asleep, pulse_awake); } return 0; } static int powermate_alloc_buffers(struct usb_device *udev, struct powermate_device *pm) { pm->data = usb_alloc_coherent(udev, POWERMATE_PAYLOAD_SIZE_MAX, GFP_KERNEL, &pm->data_dma); if (!pm->data) return -1; pm->configcr = kmalloc(sizeof(*(pm->configcr)), GFP_KERNEL); if (!pm->configcr) return -ENOMEM; return 0; } static void powermate_free_buffers(struct usb_device *udev, struct powermate_device *pm) { usb_free_coherent(udev, POWERMATE_PAYLOAD_SIZE_MAX, pm->data, pm->data_dma); kfree(pm->configcr); } /* Called whenever a USB device matching one in our supported devices table is connected */ static int powermate_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev (intf); struct usb_host_interface *interface; struct usb_endpoint_descriptor *endpoint; struct powermate_device *pm; struct input_dev *input_dev; int pipe, maxp; int error = -ENOMEM; interface = intf->cur_altsetting; if (interface->desc.bNumEndpoints < 1) return -EINVAL; endpoint = &interface->endpoint[0].desc; if (!usb_endpoint_is_int_in(endpoint)) return -EIO; usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x0a, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, interface->desc.bInterfaceNumber, NULL, 0, USB_CTRL_SET_TIMEOUT); pm = kzalloc(sizeof(struct powermate_device), GFP_KERNEL); input_dev = input_allocate_device(); if (!pm || !input_dev) goto fail1; if (powermate_alloc_buffers(udev, pm)) goto fail2; pm->irq = usb_alloc_urb(0, GFP_KERNEL); if (!pm->irq) goto fail2; pm->config = usb_alloc_urb(0, GFP_KERNEL); if (!pm->config) goto fail3; pm->udev = udev; pm->intf = intf; pm->input = input_dev; usb_make_path(udev, pm->phys, sizeof(pm->phys)); strlcat(pm->phys, "/input0", sizeof(pm->phys)); spin_lock_init(&pm->lock); switch (le16_to_cpu(udev->descriptor.idProduct)) { case POWERMATE_PRODUCT_NEW: input_dev->name = pm_name_powermate; break; case POWERMATE_PRODUCT_OLD: input_dev->name = pm_name_soundknob; break; default: input_dev->name = pm_name_soundknob; printk(KERN_WARNING "powermate: unknown product id %04x\n", le16_to_cpu(udev->descriptor.idProduct)); } input_dev->phys = pm->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, pm); input_dev->event = powermate_input_event; input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REL) | BIT_MASK(EV_MSC); input_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); input_dev->relbit[BIT_WORD(REL_DIAL)] = BIT_MASK(REL_DIAL); input_dev->mscbit[BIT_WORD(MSC_PULSELED)] = BIT_MASK(MSC_PULSELED); /* get a handle to the interrupt data pipe */ pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress); maxp = usb_maxpacket(udev, pipe); if (maxp < POWERMATE_PAYLOAD_SIZE_MIN || maxp > POWERMATE_PAYLOAD_SIZE_MAX) { printk(KERN_WARNING "powermate: Expected payload of %d--%d bytes, found %d bytes!\n", POWERMATE_PAYLOAD_SIZE_MIN, POWERMATE_PAYLOAD_SIZE_MAX, maxp); maxp = POWERMATE_PAYLOAD_SIZE_MAX; } usb_fill_int_urb(pm->irq, udev, pipe, pm->data, maxp, powermate_irq, pm, endpoint->bInterval); pm->irq->transfer_dma = pm->data_dma; pm->irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; /* register our interrupt URB with the USB system */ if (usb_submit_urb(pm->irq, GFP_KERNEL)) { error = -EIO; goto fail4; } error = input_register_device(pm->input); if (error) goto fail5; /* force an update of everything */ pm->requires_update = UPDATE_PULSE_ASLEEP | UPDATE_PULSE_AWAKE | UPDATE_PULSE_MODE | UPDATE_STATIC_BRIGHTNESS; powermate_pulse_led(pm, 0x80, 255, 0, 1, 0); // set default pulse parameters usb_set_intfdata(intf, pm); return 0; fail5: usb_kill_urb(pm->irq); fail4: usb_free_urb(pm->config); fail3: usb_free_urb(pm->irq); fail2: powermate_free_buffers(udev, pm); fail1: input_free_device(input_dev); kfree(pm); return error; } /* Called when a USB device we've accepted ownership of is removed */ static void powermate_disconnect(struct usb_interface *intf) { struct powermate_device *pm = usb_get_intfdata (intf); usb_set_intfdata(intf, NULL); if (pm) { pm->requires_update = 0; usb_kill_urb(pm->irq); input_unregister_device(pm->input); usb_kill_urb(pm->config); usb_free_urb(pm->irq); usb_free_urb(pm->config); powermate_free_buffers(interface_to_usbdev(intf), pm); kfree(pm); } } static const struct usb_device_id powermate_devices[] = { { USB_DEVICE(POWERMATE_VENDOR, POWERMATE_PRODUCT_NEW) }, { USB_DEVICE(POWERMATE_VENDOR, POWERMATE_PRODUCT_OLD) }, { USB_DEVICE(CONTOUR_VENDOR, CONTOUR_JOG) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE (usb, powermate_devices); static struct usb_driver powermate_driver = { .name = "powermate", .probe = powermate_probe, .disconnect = powermate_disconnect, .id_table = powermate_devices, }; module_usb_driver(powermate_driver); MODULE_AUTHOR( "William R Sowerbutts" ); MODULE_DESCRIPTION( "Griffin Technology, Inc PowerMate driver" ); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. */ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } static __always_inline struct rb_node * rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *), const struct rb_augment_callbacks *augment) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); augment->propagate(parent, NULL); /* suboptimal */ rb_insert_augmented_cached(node, tree, leftmost, augment); return leftmost ? node : NULL; } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p + color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */
162 170 170 170 170 163 161 3 2 1 1 1 1 1 1 138 138 170 170 170 138 138 169 169 163 163 164 168 167 164 164 163 163 163 162 170 163 164 164 169 170 170 170 149 147 32 32 170 169 170 168 169 170 8 1 170 170 170 170 170 168 4 4 168 3 169 170 169 169 170 8 2 2 29 29 1 1 29 29 168 166 168 167 168 168 168 168 168 12 177 83 167 167 177 144 144 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. * * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1999-03-10 Improved NTP compatibility by Ulrich Windl * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love * 2000-10-05 Implemented scalable SMP per-CPU timer handling. * Copyright (C) 2000, 2001, 2002 Ingo Molnar * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar */ #include <linux/kernel_stat.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/pid_namespace.h> #include <linux/notifier.h> #include <linux/thread_info.h> #include <linux/time.h> #include <linux/jiffies.h> #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> #include <asm/io.h> #include "tick-internal.h" #include "timer_migration.h" #define CREATE_TRACE_POINTS #include <trace/events/timer.h> __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of * LVL_SIZE buckets. Each level is driven by its own clock and therefore each * level has a different granularity. * * The level granularity is: LVL_CLK_DIV ^ level * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and * therefore the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading * the timers into the lower array levels. The previous 'classic' timer wheel * implementation of the kernel already violated the 'exact' expiry by adding * slack to the expiry time to provide batched expiration. The granularity * levels provide implicit batching. * * This is an optimization of the original timer wheel implementation for the * majority of the timer wheel use cases: timeouts. The vast majority of * timeout timers (networking, disk I/O ...) are canceled before expiry. If * the timeout expires it indicates that normal operation is disturbed, so it * does not matter much whether the timeout comes with a slight delay. * * The only exception to this are networking timers with a small expiry * time. They rely on the granularity. Those fit into the first wheel level, * which has HZ granularity. * * We don't have cascading anymore. timers with a expiry time above the * capacity of the last wheel level are force expired at the maximum timeout * value of the last wheel level. From data sampling we know that the maximum * value observed is 5 days (network connection tracking), so this should not * be an issue. * * The currently chosen array constants values are a good compromise between * array size and granularity. * * This results in the following granularity and range levels: * * HZ 1000 steps * Level Offset Granularity Range * 0 0 1 ms 0 ms - 63 ms * 1 64 8 ms 64 ms - 511 ms * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) * * HZ 300 * Level Offset Granularity Range * 0 0 3 ms 0 ms - 210 ms * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) * * HZ 250 * Level Offset Granularity Range * 0 0 4 ms 0 ms - 255 ms * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) * * HZ 100 * Level Offset Granularity Range * 0 0 10 ms 0 ms - 630 ms * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ /* Clock divisor for the next level */ #define LVL_CLK_SHIFT 3 #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) #define LVL_CLK_MASK (LVL_CLK_DIV - 1) #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) /* * The time start value for each level to select the bucket at enqueue * time. We start from the last possible delta of the previous level * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) /* Size of each clock level */ #define LVL_BITS 6 #define LVL_SIZE (1UL << LVL_BITS) #define LVL_MASK (LVL_SIZE - 1) #define LVL_OFFS(n) ((n) * LVL_SIZE) /* Level depth */ #if HZ > 100 # define LVL_DEPTH 9 # else # define LVL_DEPTH 8 #endif /* The cutoff (max. capacity of the wheel) */ #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) /* * The resulting wheel size. If NOHZ is configured we allocate two * wheels so we have a separate storage for the deferrable timers. */ #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) #ifdef CONFIG_NO_HZ_COMMON /* * If multiple bases need to be locked, use the base ordering for lock * nesting, i.e. lowest number first. */ # define NR_BASES 3 # define BASE_LOCAL 0 # define BASE_GLOBAL 1 # define BASE_DEF 2 #else # define NR_BASES 1 # define BASE_LOCAL 0 # define BASE_GLOBAL 0 # define BASE_DEF 0 #endif /** * struct timer_base - Per CPU timer base (number of base depends on config) * @lock: Lock protecting the timer_base * @running_timer: When expiring timers, the lock is dropped. To make * sure not to race against deleting/modifying a * currently running timer, the pointer is set to the * timer, which expires at the moment. If no timer is * running, the pointer is NULL. * @expiry_lock: PREEMPT_RT only: Lock is taken in softirq around * timer expiry callback execution and when trying to * delete a running timer and it wasn't successful in * the first glance. It prevents priority inversion * when callback was preempted on a remote CPU and a * caller tries to delete the running timer. It also * prevents a life lock, when the task which tries to * delete a timer preempted the softirq thread which * is running the timer callback function. * @timer_waiters: PREEMPT_RT only: Tells, if there is a waiter * waiting for the end of the timer callback function * execution. * @clk: clock of the timer base; is updated before enqueue * of a timer; during expiry, it is 1 offset ahead of * jiffies to avoid endless requeuing to current * jiffies * @next_expiry: expiry value of the first timer; it is updated when * finding the next timer and during enqueue; the * value is not valid, when next_expiry_recalc is set * @cpu: Number of CPU the timer base belongs to * @next_expiry_recalc: States, whether a recalculation of next_expiry is * required. Value is set true, when a timer was * deleted. * @is_idle: Is set, when timer_base is idle. It is triggered by NOHZ * code. This state is only used in standard * base. Deferrable timers, which are enqueued remotely * never wake up an idle CPU. So no matter of supporting it * for this base. * @timers_pending: Is set, when a timer is pending in the base. It is only * reliable when next_expiry_recalc is not set. * @pending_map: bitmap of the timer wheel; each bit reflects a * bucket of the wheel. When a bit is set, at least a * single timer is enqueued in the related bucket. * @vectors: Array of lists; Each array member reflects a bucket * of the timer wheel. The list contains all timers * which are enqueued into a specific bucket. */ struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; #ifdef CONFIG_PREEMPT_RT spinlock_t expiry_lock; atomic_t timer_waiters; #endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; bool next_expiry_recalc; bool is_idle; bool timers_pending; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #ifdef CONFIG_NO_HZ_COMMON static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); static DEFINE_MUTEX(timer_keys_mutex); static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP static unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); static void timers_update_migration(void) { if (sysctl_timer_migration && tick_nohz_active) static_branch_enable(&timers_migration_enabled); else static_branch_disable(&timers_migration_enabled); } #ifdef CONFIG_SYSCTL static int timer_migration_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(); mutex_unlock(&timer_keys_mutex); return ret; } static struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static int __init timer_sysctl_init(void) { register_sysctl("kernel", timer_sysctl); return 0; } device_initcall(timer_sysctl_init); #endif /* CONFIG_SYSCTL */ #else /* CONFIG_SMP */ static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ static void timer_update_keys(struct work_struct *work) { mutex_lock(&timer_keys_mutex); timers_update_migration(); static_branch_enable(&timers_nohz_active); mutex_unlock(&timer_keys_mutex); } void timers_update_nohz(void) { schedule_work(&timer_update_work); } static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); } #else static inline bool is_timers_nohz_active(void) { return false; } #endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) { int rem; unsigned long original = j; /* * We don't want all cpus firing their timers at once hitting the * same lock or cachelines, so we skew each extra cpu with an extra * 3 jiffies. This 3 jiffies came originally from the mm/ code which * already did this. * The skew is done by adding 3*cpunr, then round, then subtract this * extra offset again. */ j += cpu * 3; rem = j % HZ; /* * If the target jiffie is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. * But never round down if @force_up is set. */ if (rem < HZ/4 && !force_up) /* round down */ j = j - rem; else /* round up */ j = j - rem + HZ; /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; /* * Make sure j is still in the future. Otherwise return the * unmodified value. */ return time_is_after_jiffies(j) ? j : original; } /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, false); } EXPORT_SYMBOL_GPL(__round_jiffies); /** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, false) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_relative); /** * round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * * round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), false); } EXPORT_SYMBOL_GPL(round_jiffies); /** * round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * * round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies_relative(unsigned long j) { return __round_jiffies_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_relative); /** * __round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, true); } EXPORT_SYMBOL_GPL(__round_jiffies_up); /** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, true) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); /** * round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * * This is the same as round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), true); } EXPORT_SYMBOL_GPL(round_jiffies_up); /** * round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * * This is the same as round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up_relative(unsigned long j) { return __round_jiffies_up_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline unsigned int timer_get_idx(struct timer_list *timer) { return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; } static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) { timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | idx << TIMER_ARRAYSHIFT; } /* * Helper function to calculate the array index for a given expiry * time. */ static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { /* * The timer wheel has to guarantee that a timer does not fire * early. Early expiry can happen due to: * - Timer is armed at the edge of a tick * - Truncation of the expiry time in the outer wheel levels * * Round up with level granularity to prevent this. */ expires = (expires >> LVL_SHIFT(lvl)) + 1; *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } static int calc_wheel_index(unsigned long expires, unsigned long clk, unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { /* * Deferrable timers do not prevent the CPU from entering dynticks and * are not taken into account on the idle/nohz_full path. An IPI when a * new deferrable timer is enqueued will wake up the remote CPU but * nothing will be done with the deferrable timer base. Therefore skip * the remote IPI for deferrable timers completely. */ if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE) return; /* * We might have to IPI the remote CPU if the base is idle and the * timer is pinned. If it is a non pinned timer, it is only queued * on the remote CPU, when timer was running during queueing. Then * everything is handled by remote CPU anyway. If the other CPU is * on the way to idle then it can't set base->is_idle as we hold * the base lock: */ if (base->is_idle) { WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || tick_nohz_full_cpu(base->cpu))); wake_up_nohz_cpu(base->cpu); } } /* * Enqueue the timer into the hash bucket, mark it pending in * the bitmap, store the index in the timer flags then wake up * the target CPU if needed. */ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, bucket_expiry); /* * Check whether this is the new first expiring timer. The * effective expiry time of the timer is required here * (bucket_expiry) instead of timer->expires. */ if (time_before(bucket_expiry, base->next_expiry)) { /* * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ base->next_expiry = bucket_expiry; base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long bucket_expiry; unsigned int idx; idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr timer_debug_descr; struct timer_hint { void (*function)(struct timer_list *t); long offset; }; #define TIMER_HINT(fn, container, timr, hintfn) \ { \ .function = fn, \ .offset = offsetof(container, hintfn) - \ offsetof(container, timr) \ } static const struct timer_hint timer_hints[] = { TIMER_HINT(delayed_work_timer_fn, struct delayed_work, timer, work.func), TIMER_HINT(kthread_delayed_work_timer_fn, struct kthread_delayed_work, timer, work.func), }; static void *timer_debug_hint(void *addr) { struct timer_list *timer = addr; int i; for (i = 0; i < ARRAY_SIZE(timer_hints); i++) { if (timer_hints[i].function == timer->function) { void (**fn)(void) = addr + timer_hints[i].offset; return *fn; } } return timer->function; } static bool timer_is_static_object(void *addr) { struct timer_list *timer = addr; return (timer->entry.pprev == NULL && timer->entry.next == TIMER_ENTRY_STATIC); } /* * timer_fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: return false; } } /* Stub timer callback for improperly used timers. */ static void stub_timer(struct timer_list *unused) { WARN_ON(1); } /* * timer_fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * timer_fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: return false; } } /* * timer_fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; default: return false; } } static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, .fixup_assert_init = timer_fixup_assert_init, }; static inline void debug_timer_init(struct timer_list *timer) { debug_object_init(timer, &timer_debug_descr); } static inline void debug_timer_activate(struct timer_list *timer) { debug_object_activate(timer, &timer_debug_descr); } static inline void debug_timer_deactivate(struct timer_list *timer) { debug_object_deactivate(timer, &timer_debug_descr); } static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); void destroy_timer_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_timer_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) { debug_timer_init(timer); trace_timer_init(timer); } static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); trace_timer_cancel(timer); } static inline void debug_assert_init(struct timer_list *timer) { debug_timer_assert_init(timer); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; timer->function = func; if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } /** * init_timer_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * * init_timer_key() must be done to a timer prior to calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct hlist_node *entry = &timer->entry; debug_deactivate(timer); __hlist_del(entry); if (clear_pending) entry->pprev = NULL; entry->next = LIST_POISON2; } static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { unsigned idx = timer_get_idx(timer); if (!timer_pending(timer)) return 0; if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); base->next_expiry_recalc = true; } detach_timer(timer, clear_pending); return 1; } static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; struct timer_base *base; base = per_cpu_ptr(&timer_bases[index], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; struct timer_base *base; base = this_cpu_ptr(&timer_bases[index]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } static inline struct timer_base *get_timer_base(u32 tflags) { return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } static inline void __forward_timer_base(struct timer_base *base, unsigned long basej) { /* * Check whether we can forward the base. We can only do that when * @basej is past base->clk otherwise we might rewind base->clk. */ if (time_before_eq(basej, base->clk)) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ if (time_after(base->next_expiry, basej)) { base->clk = basej; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } } static inline void forward_timer_base(struct timer_base *base) { __forward_timer_base(base, READ_ONCE(jiffies)); } /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found in the base->vectors array. * * When a timer is migrating then the TIMER_MIGRATING flag is set and we need * to wait until the migration is done. */ static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { struct timer_base *base; u32 tf; /* * We need to use READ_ONCE() here, otherwise the compiler * might re-read @tf between the check for TIMER_MIGRATING * and spin_lock(). */ tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); raw_spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; raw_spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } } #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 #define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; int ret = 0; debug_assert_init(timer); /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new * timer with this expiry. */ long diff = timer->expires - expires; if (!diff) return 1; if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* * We lock timer base and calculate the bucket index right * here. If the timer ends up in the same bucket, then we * just update the expiry time and avoid the whole * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && time_before_eq(timer->expires, expires)) { ret = 1; goto out_unlock; } clk = base->clk; idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending * timer. If it matches set the expiry to the new value so a * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { if (!(options & MOD_TIMER_REDUCE)) timer->expires = expires; else if (time_after(timer->expires, expires)) timer->expires = expires; ret = 1; goto out_unlock; } } else { base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated * while holding base lock to prevent a race against the * shutdown code. */ if (!timer->function) goto out_unlock; forward_timer_base(base); } ret = detach_if_pending(timer, base, false); if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; new_base = get_timer_this_cpu_base(timer->flags); if (base != new_base) { /* * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise timer_delete_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that the * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); forward_timer_base(base); } } debug_timer_activate(timer); timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only * enqueue_timer() is required. Otherwise we need to (re)calculate * the wheel index via internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) enqueue_timer(base, timer, idx, bucket_expiry); else internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * mod_timer_pending - Modify a pending timer's timeout * @timer: The pending timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer_pending() is the same for pending timers as mod_timer(), but * will not activate inactive timers. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and not modified or was in * shutdown state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - Modify a timer's timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * mod_timer(timer, expires) is equivalent to: * * del_timer(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In * case that the timer is inactive, the del_timer() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the * same timer, then mod_timer() is the only safe way to modify the timeout, * since add_timer() cannot modify an already running timer. * * If @timer->function == NULL then the start operation is silently * discarded. In this case the return value is 0 and meaningless. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires did * not change the effective expiry time */ int mod_timer(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); /** * timer_reduce - Modify a timer's timeout if it would reduce the timeout * @timer: The timer to be modified * @expires: New absolute timeout in jiffies * * timer_reduce() is very similar to mod_timer(), except that it will only * modify an enqueued timer if that would reduce the expiration time. If * @timer is not enqueued it starts the timer. * * If @timer->function == NULL then the start operation is silently * discarded. * * Return: * * %0 - The timer was inactive and started or was in shutdown * state and the operation was discarded * * %1 - The timer was active and requeued to expire at @expires or * the timer was active and not modified because @expires * did not change the effective expiry time such that the * timer would expire earlier than already scheduled */ int timer_reduce(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_REDUCE); } EXPORT_SYMBOL(timer_reduce); /** * add_timer - Start a timer * @timer: The timer to be started * * Start @timer to expire at @timer->expires in the future. @timer->expires * is the absolute expiry time measured in 'jiffies'. When the timer expires * timer->function(timer) will be invoked from soft interrupt context. * * The @timer->expires and @timer->function fields must be set prior * to calling this function. * * If @timer->function == NULL then the start operation is silently * discarded. * * If @timer->expires is already in the past @timer will be queued to * expire at the next timer tick. * * This can only operate on an inactive timer. Attempts to invoke this on * an active timer are rejected with a warning. */ void add_timer(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); /** * add_timer_local() - Start a timer on the local CPU * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is set. * * See add_timer() for further details. */ void add_timer_local(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags |= TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_local); /** * add_timer_global() - Start a timer without TIMER_PINNED flag set * @timer: The timer to be started * * Same as add_timer() except that the timer flag TIMER_PINNED is unset. * * See add_timer() for further details. */ void add_timer_global(struct timer_list *timer) { if (WARN_ON_ONCE(timer_pending(timer))) return; timer->flags &= ~TIMER_PINNED; __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer_global); /** * add_timer_on - Start a timer on a particular CPU * @timer: The timer to be started * @cpu: The CPU to start it on * * Same as add_timer() except that it starts the timer on the given CPU and * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in * the next round, add_timer_global() should be used instead as it unsets * the TIMER_PINNED flag. * * See add_timer() for further details. */ void add_timer_on(struct timer_list *timer, int cpu) { struct timer_base *new_base, *base; unsigned long flags; debug_assert_init(timer); if (WARN_ON_ONCE(timer_pending(timer))) return; /* Make sure timer flags have TIMER_PINNED flag set */ timer->flags |= TIMER_PINNED; new_base = get_timer_cpu_base(timer->flags, cpu); /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the * wrong base locked. See lock_timer_base(). */ base = lock_timer_base(timer, &flags); /* * Has @timer been shutdown? This needs to be evaluated while * holding base lock to prevent a race against the shutdown code. */ if (!timer->function) goto out_unlock; if (base != new_base) { timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } forward_timer_base(base); debug_timer_activate(timer); internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); /** * __timer_delete - Internal function: Deactivate a timer * @timer: The timer to be deactivated * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the time. In that * case any attempt to rearm @timer after this function returns will be * silently ignored. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = 0; debug_assert_init(timer); /* * If @shutdown is set then the lock has to be taken whether the * timer is pending or not to protect against a concurrent rearm * which might hit between the lockless pending check and the lock * acquisition. By taking the lock it is ensured that such a newly * enqueued timer is dequeued and cannot end up with * timer->function == NULL in the expiry code. * * If timer->function is currently executed, then this makes sure * that the callback cannot requeue the timer. */ if (timer_pending(timer) || shutdown) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; } /** * timer_delete - Deactivate a timer * @timer: The timer to be deactivated * * The function only deactivates a pending timer, but contrary to * timer_delete_sync() it does not take into account whether the timer's * callback function is concurrently executed on a different CPU or not. * It neither prevents rearming of the timer. If @timer can be rearmed * concurrently then the return value of this function is meaningless. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete(struct timer_list *timer) { return __timer_delete(timer, false); } EXPORT_SYMBOL(timer_delete); /** * timer_shutdown - Deactivate a timer and prevent rearming * @timer: The timer to be deactivated * * The function does not wait for an eventually running timer callback on a * different CPU but it prevents rearming of the timer. Any attempt to arm * @timer after this function returns will be silently ignored. * * This function is useful for teardown code and should only be used when * timer_shutdown_sync() cannot be invoked due to locking or context constraints. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown(struct timer_list *timer) { return __timer_delete(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown); /** * __try_to_del_timer_sync - Internal function: Try to deactivate a timer * @timer: Timer to deactivate * @shutdown: If true, this indicates that the timer is about to be * shutdown permanently. * * If @shutdown is true then @timer->function is set to NULL under the * timer base lock which prevents further rearming of the timer. Any * attempt to rearm @timer after this function returns will be silently * ignored. * * This function cannot guarantee that the timer cannot be rearmed * right after dropping the base lock if @shutdown is false. That * needs to be prevented by the calling code if necessary. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) { struct timer_base *base; unsigned long flags; int ret = -1; debug_assert_init(timer); base = lock_timer_base(timer, &flags); if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); if (shutdown) timer->function = NULL; raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not * queued and the timer callback function is not running on any CPU. * * This function does not guarantee that the timer cannot be rearmed right * after dropping the base lock. That needs to be prevented by the calling * code if necessary. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ int try_to_del_timer_sync(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { spin_lock_init(&base->expiry_lock); } static inline void timer_base_lock_expiry(struct timer_base *base) { spin_lock(&base->expiry_lock); } static inline void timer_base_unlock_expiry(struct timer_base *base) { spin_unlock(&base->expiry_lock); } /* * The counterpart to del_timer_wait_running(). * * If there is a waiter for base->expiry_lock, then it was waiting for the * timer callback to finish. Drop expiry_lock and reacquire it. That allows * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); spin_unlock(&base->expiry_lock); spin_lock(&base->expiry_lock); raw_spin_lock_irq(&base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion, if the softirq thread on a remote CPU * got preempted, and it prevents a life lock when the task which tries to * delete a timer preempted the softirq thread running the timer callback * function. */ static void del_timer_wait_running(struct timer_list *timer) { u32 tf; tf = READ_ONCE(timer->flags); if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* * Mark the base as contended and grab the expiry lock, * which is held by the softirq across the timer * callback. Drop the lock immediately so the softirq can * expire the next timer. In theory the timer could already * be running again, but that's more than unlikely and just * causes another wait loop. */ atomic_inc(&base->timer_waiters); spin_lock_bh(&base->expiry_lock); atomic_dec(&base->timer_waiters); spin_unlock_bh(&base->expiry_lock); } } #else static inline void timer_base_init_expiry_lock(struct timer_base *base) { } static inline void timer_base_lock_expiry(struct timer_base *base) { } static inline void timer_base_unlock_expiry(struct timer_base *base) { } static inline void timer_sync_wait_running(struct timer_base *base) { } static inline void del_timer_wait_running(struct timer_list *timer) { } #endif /** * __timer_delete_sync - Internal function: Deactivate a timer and wait * for the handler to finish. * @timer: The timer to be deactivated * @shutdown: If true, @timer->function will be set to NULL under the * timer base lock which prevents rearming of @timer * * If @shutdown is not set the timer can be rearmed later. If the timer can * be rearmed concurrently, i.e. after dropping the base lock then the * return value is meaningless. * * If @shutdown is set then @timer->function is set to NULL under timer * base lock which prevents rearming of the timer. Any attempt to rearm * a shutdown timer is silently ignored. * * If the timer should be reused after shutdown it has to be initialized * again. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ static int __timer_delete_sync(struct timer_list *timer, bool shutdown) { int ret; #ifdef CONFIG_LOCKDEP unsigned long flags; /* * If lockdep gives a backtrace here, please reference * the synchronization rules above. */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it * could lead to deadlock. */ WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE)); /* * Must be able to sleep on PREEMPT_RT because of the slowpath in * del_timer_wait_running(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) lockdep_assert_preemption_enabled(); do { ret = __try_to_del_timer_sync(timer, shutdown); if (unlikely(ret < 0)) { del_timer_wait_running(timer); cpu_relax(); } } while (ret < 0); return ret; } /** * timer_delete_sync - Deactivate a timer and wait for the handler to finish. * @timer: The timer to be deactivated * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts unless the timer is an irqsafe one. The caller must * not hold locks which would prevent completion of the timer's callback * function. The timer's handler must not call add_timer_on(). Upon exit * the timer is not queued and the handler is not running on any CPU. * * For !irqsafe timers, the caller must not hold locks that are held in * interrupt context. Even if the lock has nothing to do with the timer in * question. Here's why:: * * CPU0 CPU1 * ---- ---- * <SOFTIRQ> * call_timer_fn(); * base->running_timer = mytimer; * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); * timer_delete_sync(mytimer); * while (base->running_timer == mytimer); * * Now timer_delete_sync() will never return and never release somelock. * The interrupt on the other CPU is waiting to grab somelock but it has * interrupted the softirq that CPU0 is waiting to finish. * * This function cannot guarantee that the timer is not rearmed again by * some concurrent or preempting code, right after it dropped the base * lock. If there is the possibility of a concurrent rearm then the return * value of the function is meaningless. * * If such a guarantee is needed, e.g. for teardown situations then use * timer_shutdown_sync() instead. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending and deactivated */ int timer_delete_sync(struct timer_list *timer) { return __timer_delete_sync(timer, false); } EXPORT_SYMBOL(timer_delete_sync); /** * timer_shutdown_sync - Shutdown a timer and prevent rearming * @timer: The timer to be shutdown * * When the function returns it is guaranteed that: * - @timer is not queued * - The callback function of @timer is not running * - @timer cannot be enqueued again. Any attempt to rearm * @timer is silently ignored. * * See timer_delete_sync() for synchronization rules. * * This function is useful for final teardown of an infrastructure where * the timer is subject to a circular dependency problem. * * A common pattern for this is a timer and a workqueue where the timer can * schedule work and work can arm the timer. On shutdown the workqueue must * be destroyed and the timer must be prevented from rearming. Unless the * code has conditionals like 'if (mything->in_shutdown)' to prevent that * there is no way to get this correct with timer_delete_sync(). * * timer_shutdown_sync() is solving the problem. The correct ordering of * calls in this case is: * * timer_shutdown_sync(&mything->timer); * workqueue_destroy(&mything->workqueue); * * After this 'mything' can be safely freed. * * This obviously implies that the timer is not required to be functional * for the rest of the shutdown operation. * * Return: * * %0 - The timer was not pending * * %1 - The timer was pending */ int timer_shutdown_sync(struct timer_list *timer) { return __timer_delete_sync(timer, true); } EXPORT_SYMBOL_GPL(timer_shutdown_sync); static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *), unsigned long baseclk) { int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* * It is permissible to free the timer from inside the * function that is called from it, this we need to take into * account for lockdep too. To avoid bogus "held lock freed" * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at * timer_delete_sync() by acquiring the lock_map around the fn() * call here and in timer_delete_sync(). */ lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); if (count != preempt_count()) { WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ preempt_count_set(count); } } static void expire_timers(struct timer_base *base, struct hlist_head *head) { /* * This value is required only for tracing. base->clk was * incremented directly before expire_timers was called. But expiry * is related to the old base->clk value. */ unsigned long baseclk = base->clk - 1; while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); base->running_timer = timer; detach_timer(timer, true); fn = timer->function; if (WARN_ON_ONCE(!fn)) { /* Should never happen. Emphasis on should! */ base->running_timer = NULL; continue; } if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); base->running_timer = NULL; } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); base->running_timer = NULL; timer_sync_wait_running(base); } } } static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; for (i = 0; i < LVL_DEPTH; i++) { idx = (clk & LVL_MASK) + i * LVL_SIZE; if (__test_and_clear_bit(idx, base->pending_map)) { vec = base->vectors + idx; hlist_move_list(vec, heads++); levels++; } /* Is it time to look at the next level? */ if (clk & LVL_CLK_MASK) break; /* Shift clock for the next level granularity */ clk >>= LVL_CLK_SHIFT; } return levels; } /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) { unsigned pos, start = offset + clk; unsigned end = offset + LVL_SIZE; pos = find_next_bit(base->pending_map, end, start); if (pos < end) return pos - start; pos = find_next_bit(base->pending_map, start, offset); return pos < start ? pos + LVL_SIZE - start : -1; } /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. * * Store next expiry time in base->next_expiry. */ static void next_expiry_recalc(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; next = base->clk + NEXT_TIMER_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; /* * If the next expiration happens before we reach * the next level, no need to check further. */ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) break; } /* * Clock for the next level. If the current level clock lower * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next * expiring jiffie. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 * * we have to look at all levels @index 0. With * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 2 * * LVL0 has the next expiring bucket @index 2. The upper * levels have the next expiring bucket @index 1. * * In case that the propagation wraps the next level the same * rules apply: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 F 2 * * So after looking at LVL0 we get: * * LVL5 LVL4 LVL3 LVL2 LVL1 * 0 0 0 1 0 * * So no propagation from LVL1 to LVL2 because that happened * with the add already, but then we need to propagate further * from LVL2 to LVL3. * * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } base->next_expiry = next; base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { u64 nextevt = hrtimer_get_next_event(); /* * If high resolution timers are enabled * hrtimer_get_next_event() returns KTIME_MAX. */ if (expires <= nextevt) return expires; /* * If the next timer is already expired, return the tick base * time so the tick is fired immediately. */ if (nextevt <= basem) return basem; /* * Round up to the next jiffie. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. * * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } static unsigned long next_timer_interrupt(struct timer_base *base, unsigned long basej) { if (base->next_expiry_recalc) next_expiry_recalc(base); /* * Move next_expiry for the empty base into the future to prevent an * unnecessary raise of the timer softirq when the next_expiry value * will be reached even if there is no timer pending. * * This update is also required to make timer_base::next_expiry values * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) base->next_expiry = basej + NEXT_TIMER_MAX_DELTA; return base->next_expiry; } static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem, struct timer_base *base_local, struct timer_base *base_global, struct timer_events *tevt) { unsigned long nextevt, nextevt_local, nextevt_global; bool local_first; nextevt_local = next_timer_interrupt(base_local, basej); nextevt_global = next_timer_interrupt(base_global, basej); local_first = time_before_eq(nextevt_local, nextevt_global); nextevt = local_first ? nextevt_local : nextevt_global; /* * If the @nextevt is at max. one tick away, use @nextevt and store * it in the local expiry value. The next global event is irrelevant in * this case and can be left as KTIME_MAX. */ if (time_before_eq(nextevt, basej + 1)) { /* If we missed a tick already, force 0 delta */ if (time_before(nextevt, basej)) nextevt = basej; tevt->local = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * This is required for the remote check only but it doesn't * hurt, when it is done for both call sites: * * * The remote callers will only take care of the global timers * as local timers will be handled by CPU itself. When not * updating tevt->global with the already missed first global * timer, it is possible that it will be missed completely. * * * The local callers will ignore the tevt->global anyway, when * nextevt is max. one tick away. */ if (!local_first) tevt->global = tevt->local; return nextevt; } /* * Update tevt.* values: * * If the local queue expires first, then the global event can be * ignored. If the global queue is empty, nothing to do either. */ if (!local_first && base_global->timers_pending) tevt->global = basem + (u64)(nextevt_global - basej) * TICK_NSEC; if (base_local->timers_pending) tevt->local = basem + (u64)(nextevt_local - basej) * TICK_NSEC; return nextevt; } # ifdef CONFIG_SMP /** * fetch_next_timer_interrupt_remote() - Store next timers into @tevt * @basej: base time jiffies * @basem: base time clock monotonic * @tevt: Pointer to the storage for the expiry values * @cpu: Remote CPU * * Stores the next pending local and global timer expiry values in the * struct pointed to by @tevt. If a queue is empty the corresponding * field is set to KTIME_MAX. If local event expires before global * event, global event is set to KTIME_MAX as well. * * Caller needs to make sure timer base locks are held (use * timer_lock_remote_bases() for this purpose). */ void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem, struct timer_events *tevt, unsigned int cpu) { struct timer_base *base_local, *base_global; /* Preset local / global events */ tevt->local = tevt->global = KTIME_MAX; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_held(&base_local->lock); lockdep_assert_held(&base_global->lock); fetch_next_timer_interrupt(basej, basem, base_local, base_global, tevt); } /** * timer_unlock_remote_bases - unlock timer bases of cpu * @cpu: Remote CPU * * Unlocks the remote timer bases. */ void timer_unlock_remote_bases(unsigned int cpu) __releases(timer_bases[BASE_LOCAL]->lock) __releases(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); } /** * timer_lock_remote_bases - lock timer bases of cpu * @cpu: Remote CPU * * Locks the remote timer bases. */ void timer_lock_remote_bases(unsigned int cpu) __acquires(timer_bases[BASE_LOCAL]->lock) __acquires(timer_bases[BASE_GLOBAL]->lock) { struct timer_base *base_local, *base_global; base_local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu); base_global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); lockdep_assert_irqs_disabled(); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); } /** * timer_base_is_idle() - Return whether timer base is set idle * * Returns value of local timer base is_idle value. */ bool timer_base_is_idle(void) { return __this_cpu_read(timer_bases[BASE_LOCAL].is_idle); } static void __run_timer_base(struct timer_base *base); /** * timer_expire_remote() - expire global timers of cpu * @cpu: Remote CPU * * Expire timers of global base of remote CPU. */ void timer_expire_remote(unsigned int cpu) { struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu); __run_timer_base(base); } static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { u64 next_tmigr; if (timer_base_idle) next_tmigr = tmigr_cpu_new_timer(tevt->global); else if (tick_stop_path) next_tmigr = tmigr_cpu_deactivate(tevt->global); else next_tmigr = tmigr_quick_check(tevt->global); /* * If the CPU is the last going idle in timer migration hierarchy, make * sure the CPU will wake up in time to handle remote timers. * next_tmigr == KTIME_MAX if other CPUs are still active. */ if (next_tmigr < tevt->local) { u64 tmp; /* If we missed a tick already, force 0 delta */ if (next_tmigr < basem) next_tmigr = basem; tmp = div_u64(next_tmigr - basem, TICK_NSEC); *nextevt = basej + (unsigned long)tmp; tevt->local = next_tmigr; } } # else static void timer_use_tmigr(unsigned long basej, u64 basem, unsigned long *nextevt, bool *tick_stop_path, bool timer_base_idle, struct timer_events *tevt) { /* * Make sure first event is written into tevt->local to not miss a * timer on !SMP systems. */ tevt->local = min_t(u64, tevt->local, tevt->global); } # endif /* CONFIG_SMP */ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, bool *idle) { struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX }; struct timer_base *base_local, *base_global; unsigned long nextevt; bool idle_is_possible; /* * When the CPU is offline, the tick is cancelled and nothing is supposed * to try to stop it. */ if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) { if (idle) *idle = true; return tevt.local; } base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]); base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]); raw_spin_lock(&base_local->lock); raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING); nextevt = fetch_next_timer_interrupt(basej, basem, base_local, base_global, &tevt); /* * If the next event is only one jiffie ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. * * The proper timer migration hierarchy function depends on the callsite * and whether timer base is idle or not. @nextevt will be updated when * this CPU needs to handle the first timer migration hierarchy * event. See timer_use_tmigr() for detailed information. */ idle_is_possible = time_after(nextevt, basej + 1); if (idle_is_possible) timer_use_tmigr(basej, basem, &nextevt, idle, base_local->is_idle, &tevt); /* * We have a fresh next event. Check whether we can forward the * base. */ __forward_timer_base(base_local, basej); __forward_timer_base(base_global, basej); /* * Set base->is_idle only when caller is timer_base_try_to_set_idle() */ if (idle) { /* * Bases are idle if the next event is more than a tick * away. Caution: @nextevt could have changed by enqueueing a * global timer into timer migration hierarchy. Therefore a new * check is required here. * * If the base is marked idle then any timer add operation must * forward the base clk itself to keep granularity small. This * idle logic is only maintained for the BASE_LOCAL and * BASE_GLOBAL base, deferrable timers may still see large * granularity skew (by design). */ if (!base_local->is_idle && time_after(nextevt, basej + 1)) { base_local->is_idle = true; /* * Global timers queued locally while running in a task * in nohz_full mode need a self-IPI to kick reprogramming * in IRQ tail. */ if (tick_nohz_full_cpu(base_local->cpu)) base_global->is_idle = true; trace_timer_base_idle(true, base_local->cpu); } *idle = base_local->is_idle; /* * When timer base is not set idle, undo the effect of * tmigr_cpu_deactivate() to prevent inconsistent states - active * timer base but inactive timer migration hierarchy. * * When timer base was already marked idle, nothing will be * changed here. */ if (!base_local->is_idle && idle_is_possible) tmigr_cpu_activate(); } raw_spin_unlock(&base_global->lock); raw_spin_unlock(&base_local->lock); return cmp_next_hrtimer_event(basem, tevt.local); } /** * get_next_timer_interrupt() - return the time (clock mono) of the next timer * @basej: base time jiffies * @basem: base time clock monotonic * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. If timer of global base was queued into * timer migration hierarchy, first global timer is not taken into account. If * it was the last CPU of timer migration hierarchy going idle, first global * event is taken into account. */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { return __get_next_timer_interrupt(basej, basem, NULL); } /** * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases * @basej: base time jiffies * @basem: base time clock monotonic * @idle: pointer to store the value of timer_base->is_idle on return; * *idle contains the information whether tick was already stopped * * Returns the tick aligned clock monotonic time of the next pending timer or * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is * returned as well. */ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) { if (*idle) return KTIME_MAX; return __get_next_timer_interrupt(basej, basem, idle); } /** * timer_clear_idle - Clear the idle state of the timer base * * Called with interrupts disabled */ void timer_clear_idle(void) { /* * We do this unlocked. The worst outcome is a remote pinned timer * enqueue sending a pointless IPI, but taking the lock would just * make the window for sending the IPI a few instructions smaller * for the cost of taking the lock in the exit from idle * path. Required for BASE_LOCAL only. */ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); if (tick_nohz_full_cpu(smp_processor_id())) __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); trace_timer_base_idle(false, smp_processor_id()); /* Activate without holding the timer_base->lock */ tmigr_cpu_activate(); } #endif /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. */ static inline void __run_timers(struct timer_base *base) { struct hlist_head heads[LVL_DEPTH]; int levels; lockdep_assert_held(&base->lock); if (base->running_timer) return; while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* * The two possible reasons for not finding any expired * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + * NEXT_TIMER_MAX_DELTA. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); /* * While executing timers, base->clk is set 1 offset ahead of * jiffies to avoid endless requeuing to current jiffies. */ base->clk++; next_expiry_recalc(base); while (levels--) expire_timers(base, heads + levels); } } static void __run_timer_base(struct timer_base *base) { if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); __run_timers(base); raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } static void run_timer_base(int index) { struct timer_base *base = this_cpu_ptr(&timer_bases[index]); __run_timer_base(base); } /* * This function runs timers and the timer-tq in bottom half context. */ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { run_timer_base(BASE_GLOBAL); run_timer_base(BASE_DEF); if (is_timers_nohz_active()) tmigr_handle_remote(); } } /* * Called by the local, per-CPU timer interrupt on SMP. */ static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]); hrtimer_run_queues(); for (int i = 0; i < NR_BASES; i++, base++) { /* Raise the softirq only if required. */ if (time_after_eq(jiffies, base->next_expiry) || (i == BASE_DEF && tmigr_requires_handle_remote())) { raise_softirq(TIMER_SOFTIRQ); return; } } } /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. */ void update_process_times(int user_tick) { struct task_struct *p = current; /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); #endif sched_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ struct process_timer { struct timer_list timer; struct task_struct *task; }; static void process_timeout(struct timer_list *t) { struct process_timer *timeout = from_timer(timeout, t, timer); wake_up_process(timeout->task); } /** * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have elapsed. * The function behavior depends on the current task state * (see also set_current_state() description): * * %TASK_RUNNING - the scheduler is called, but the task does not sleep * at all. That happens because sched_submit_work() does nothing for * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns 0 when the timer has expired otherwise the remaining time in * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { struct process_timer timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " "value %lx\n", timeout); dump_stack(); __set_current_state(TASK_RUNNING); goto out; } } expire = timeout + jiffies; timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); del_timer_sync(&timer.timer); /* Remove the timer from the object tracker */ destroy_timer_on_stack(&timer.timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } EXPORT_SYMBOL(schedule_timeout); /* * We can use __set_current_state() here because schedule_timeout() calls * schedule() unconditionally. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_killable); signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); /* * Like schedule_timeout_uninterruptible(), except this task will not contribute * to load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { __set_current_state(TASK_IDLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } int timers_prepare_cpu(unsigned int cpu) { struct timer_base *base; int b; for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; } return 0; } int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; struct timer_base *new_base; int b, i; for (b = 0; b < NR_BASES; b++) { old_base = per_cpu_ptr(&timer_bases[b], cpu); new_base = get_cpu_ptr(&timer_bases[b]); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock_irq(&new_base->lock); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); /* * The current CPUs base clock might be stale. Update it * before moving the timers over. */ forward_timer_base(new_base); WARN_ON_ONCE(old_base->running_timer); old_base->running_timer = NULL; for (i = 0; i < WHEEL_SIZE; i++) migrate_timer_list(new_base, old_base->vectors + i); raw_spin_unlock(&old_base->lock); raw_spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } return 0; } #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) { struct timer_base *base; int i; for (i = 0; i < NR_BASES; i++) { base = per_cpu_ptr(&timer_bases[i], cpu); base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } static void __init init_timer_cpus(void) { int cpu; for_each_possible_cpu(cpu) init_timer_cpu(cpu); } void __init init_timers(void) { init_timer_cpus(); posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Time in milliseconds to sleep for */ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; while (timeout) timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals * @msecs: Time in milliseconds to sleep for */ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); /** * usleep_range_state - Sleep for an approximate time in a given state * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } EXPORT_SYMBOL(usleep_range_state);
6 136 40 32 24 1 2 2 1 131 133 133 71 144 143 71 4 136 142 23 141 139 135 135 135 142 139 142 144 144 144 144 143 131 131 131 131 130 131 144 136 6 135 136 135 144 17 17 16 17 17 17 17 8 8 8 4 3 11 11 2 2 7 9 9 9 5 5 1 5 5 28 28 28 28 6 6 6 33 33 33 33 33 5 4 3 3 2 2 2 2 31 32 32 6 8 6 1 7 4 5 2 6 2 2 24 26 26 4 26 24 24 25 24 24 24 4 4 4 4 14 14 14 14 166 13 170 170 171 171 1 171 165 38 171 169 170 170 160 171 165 171 171 7 7 7 144 144 144 144 143 24 144 136 136 136 10 10 10 12 135 136 9 9 136 131 131 131 130 108 12 136 131 131 130 131 130 4 135 136 133 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 // SPDX-License-Identifier: GPL-2.0+ /* * dummy_hcd.c -- Dummy/Loopback USB host and device emulator driver. * * Maintainer: Alan Stern <stern@rowland.harvard.edu> * * Copyright (C) 2003 David Brownell * Copyright (C) 2003-2005 Alan Stern */ /* * This exposes a device side "USB gadget" API, driven by requests to a * Linux-USB host controller driver. USB traffic is simulated; there's * no need for USB hardware. Use this with two other drivers: * * - Gadget driver, responding to requests (device); * - Host-side device driver, as already familiar in Linux. * * Having this all in one kernel can help some stages of development, * bypassing some hardware (and driver) issues. UML could help too. * * Note: The emulation does not include isochronous transfers! */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/delay.h> #include <linux/ioport.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/interrupt.h> #include <linux/platform_device.h> #include <linux/usb.h> #include <linux/usb/gadget.h> #include <linux/usb/hcd.h> #include <linux/scatterlist.h> #include <asm/byteorder.h> #include <linux/io.h> #include <asm/irq.h> #include <asm/unaligned.h> #define DRIVER_DESC "USB Host+Gadget Emulator" #define DRIVER_VERSION "02 May 2005" #define POWER_BUDGET 500 /* in mA; use 8 for low-power port testing */ #define POWER_BUDGET_3 900 /* in mA */ #define DUMMY_TIMER_INT_NSECS 125000 /* 1 microframe */ static const char driver_name[] = "dummy_hcd"; static const char driver_desc[] = "USB Host+Gadget Emulator"; static const char gadget_name[] = "dummy_udc"; MODULE_DESCRIPTION(DRIVER_DESC); MODULE_AUTHOR("David Brownell"); MODULE_LICENSE("GPL"); struct dummy_hcd_module_parameters { bool is_super_speed; bool is_high_speed; unsigned int num; }; static struct dummy_hcd_module_parameters mod_data = { .is_super_speed = false, .is_high_speed = true, .num = 1, }; module_param_named(is_super_speed, mod_data.is_super_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_super_speed, "true to simulate SuperSpeed connection"); module_param_named(is_high_speed, mod_data.is_high_speed, bool, S_IRUGO); MODULE_PARM_DESC(is_high_speed, "true to simulate HighSpeed connection"); module_param_named(num, mod_data.num, uint, S_IRUGO); MODULE_PARM_DESC(num, "number of emulated controllers"); /*-------------------------------------------------------------------------*/ /* gadget side driver data structres */ struct dummy_ep { struct list_head queue; unsigned long last_io; /* jiffies timestamp */ struct usb_gadget *gadget; const struct usb_endpoint_descriptor *desc; struct usb_ep ep; unsigned halted:1; unsigned wedged:1; unsigned already_seen:1; unsigned setup_stage:1; unsigned stream_en:1; }; struct dummy_request { struct list_head queue; /* ep's requests */ struct usb_request req; }; static inline struct dummy_ep *usb_ep_to_dummy_ep(struct usb_ep *_ep) { return container_of(_ep, struct dummy_ep, ep); } static inline struct dummy_request *usb_request_to_dummy_request (struct usb_request *_req) { return container_of(_req, struct dummy_request, req); } /*-------------------------------------------------------------------------*/ /* * Every device has ep0 for control requests, plus up to 30 more endpoints, * in one of two types: * * - Configurable: direction (in/out), type (bulk, iso, etc), and endpoint * number can be changed. Names like "ep-a" are used for this type. * * - Fixed Function: in other cases. some characteristics may be mutable; * that'd be hardware-specific. Names like "ep12out-bulk" are used. * * Gadget drivers are responsible for not setting up conflicting endpoint * configurations, illegal or unsupported packet lengths, and so on. */ static const char ep0name[] = "ep0"; static const struct { const char *name; const struct usb_ep_caps caps; } ep_info[] = { #define EP_INFO(_name, _caps) \ { \ .name = _name, \ .caps = _caps, \ } /* we don't provide isochronous endpoints since we don't support them */ #define TYPE_BULK_OR_INT (USB_EP_CAPS_TYPE_BULK | USB_EP_CAPS_TYPE_INT) /* everyone has ep0 */ EP_INFO(ep0name, USB_EP_CAPS(USB_EP_CAPS_TYPE_CONTROL, USB_EP_CAPS_DIR_ALL)), /* act like a pxa250: fifteen fixed function endpoints */ EP_INFO("ep1in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep2out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep3in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep4out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep5in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep6in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep7out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep8in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep9out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep10in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep11in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), EP_INFO("ep12out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), /* EP_INFO("ep13in-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_IN)), EP_INFO("ep14out-iso", USB_EP_CAPS(USB_EP_CAPS_TYPE_ISO, USB_EP_CAPS_DIR_OUT)), */ EP_INFO("ep15in-int", USB_EP_CAPS(USB_EP_CAPS_TYPE_INT, USB_EP_CAPS_DIR_IN)), /* or like sa1100: two fixed function endpoints */ EP_INFO("ep1out-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep2in-bulk", USB_EP_CAPS(USB_EP_CAPS_TYPE_BULK, USB_EP_CAPS_DIR_IN)), /* and now some generic EPs so we have enough in multi config */ EP_INFO("ep-aout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-bin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-cout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-dout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-ein", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-fout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-gin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-hout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-iout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-jin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-kout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), EP_INFO("ep-lin", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_IN)), EP_INFO("ep-mout", USB_EP_CAPS(TYPE_BULK_OR_INT, USB_EP_CAPS_DIR_OUT)), #undef EP_INFO }; #define DUMMY_ENDPOINTS ARRAY_SIZE(ep_info) /*-------------------------------------------------------------------------*/ #define FIFO_SIZE 64 struct urbp { struct urb *urb; struct list_head urbp_list; struct sg_mapping_iter miter; u32 miter_started; }; enum dummy_rh_state { DUMMY_RH_RESET, DUMMY_RH_SUSPENDED, DUMMY_RH_RUNNING }; struct dummy_hcd { struct dummy *dum; enum dummy_rh_state rh_state; struct hrtimer timer; u32 port_status; u32 old_status; unsigned long re_timeout; struct usb_device *udev; struct list_head urbp_list; struct urbp *next_frame_urbp; u32 stream_en_ep; u8 num_stream[30 / 2]; unsigned active:1; unsigned old_active:1; unsigned resuming:1; }; struct dummy { spinlock_t lock; /* * DEVICE/GADGET side support */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; /* * HOST side support */ struct dummy_hcd *hs_hcd; struct dummy_hcd *ss_hcd; }; static inline struct dummy_hcd *hcd_to_dummy_hcd(struct usb_hcd *hcd) { return (struct dummy_hcd *) (hcd->hcd_priv); } static inline struct usb_hcd *dummy_hcd_to_hcd(struct dummy_hcd *dum) { return container_of((void *) dum, struct usb_hcd, hcd_priv); } static inline struct device *dummy_dev(struct dummy_hcd *dum) { return dummy_hcd_to_hcd(dum)->self.controller; } static inline struct device *udc_dev(struct dummy *dum) { return dum->gadget.dev.parent; } static inline struct dummy *ep_to_dummy(struct dummy_ep *ep) { return container_of(ep->gadget, struct dummy, gadget); } static inline struct dummy_hcd *gadget_to_dummy_hcd(struct usb_gadget *gadget) { struct dummy *dum = container_of(gadget, struct dummy, gadget); if (dum->gadget.speed == USB_SPEED_SUPER) return dum->ss_hcd; else return dum->hs_hcd; } static inline struct dummy *gadget_dev_to_dummy(struct device *dev) { return container_of(dev, struct dummy, gadget.dev); } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE UTILITY ROUTINES */ /* called with spinlock held */ static void nuke(struct dummy *dum, struct dummy_ep *ep) { while (!list_empty(&ep->queue)) { struct dummy_request *req; req = list_entry(ep->queue.next, struct dummy_request, queue); list_del_init(&req->queue); req->req.status = -ESHUTDOWN; spin_unlock(&dum->lock); usb_gadget_giveback_request(&ep->ep, &req->req); spin_lock(&dum->lock); } } /* caller must hold lock */ static void stop_activity(struct dummy *dum) { int i; /* prevent any more requests */ dum->address = 0; /* The timer is left running so that outstanding URBs can fail */ /* nuke any pending requests first, so driver i/o is quiesced */ for (i = 0; i < DUMMY_ENDPOINTS; ++i) nuke(dum, &dum->ep[i]); /* driver now does any non-usb quiescing necessary */ } /** * set_link_state_by_speed() - Sets the current state of the link according to * the hcd speed * @dum_hcd: pointer to the dummy_hcd structure to update the link state for * * This function updates the port_status according to the link state and the * speed of the hcd. */ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) { struct dummy *dum = dum_hcd->dum; if (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3) { if ((dum_hcd->port_status & USB_SS_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { /* device is connected and not suspended */ dum_hcd->port_status |= (USB_PORT_STAT_CONNECTION | USB_PORT_STAT_SPEED_5GBPS) ; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) && (dum_hcd->port_status & USB_PORT_STAT_LINK_STATE) == USB_SS_PORT_LS_U0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } else { if ((dum_hcd->port_status & USB_PORT_STAT_POWER) == 0) { dum_hcd->port_status = 0; } else if (!dum->pullup || dum->udc_suspended) { /* UDC suspend must cause a disconnect */ dum_hcd->port_status &= ~(USB_PORT_STAT_CONNECTION | USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED | USB_PORT_STAT_SUSPEND); if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) != 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); } else { dum_hcd->port_status |= USB_PORT_STAT_CONNECTION; if ((dum_hcd->old_status & USB_PORT_STAT_CONNECTION) == 0) dum_hcd->port_status |= (USB_PORT_STAT_C_CONNECTION << 16); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0) dum_hcd->port_status &= ~USB_PORT_STAT_SUSPEND; else if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) dum_hcd->active = 1; } } } /* caller must hold lock */ static void set_link_state(struct dummy_hcd *dum_hcd) __must_hold(&dum->lock) { struct dummy *dum = dum_hcd->dum; unsigned int power_bit; dum_hcd->active = 0; if (dum->pullup) if ((dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 && dum->gadget.speed != USB_SPEED_SUPER) || (dummy_hcd_to_hcd(dum_hcd)->speed != HCD_USB3 && dum->gadget.speed == USB_SPEED_SUPER)) return; set_link_state_by_speed(dum_hcd); power_bit = (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 ? USB_SS_PORT_STAT_POWER : USB_PORT_STAT_POWER); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0 || dum_hcd->active) dum_hcd->resuming = 0; /* Currently !connected or in reset */ if ((dum_hcd->port_status & power_bit) == 0 || (dum_hcd->port_status & USB_PORT_STAT_RESET) != 0) { unsigned int disconnect = power_bit & dum_hcd->old_status & (~dum_hcd->port_status); unsigned int reset = USB_PORT_STAT_RESET & (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ if (dum->ints_enabled && (disconnect || reset)) { stop_activity(dum); ++dum->callback_usage; spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } } else if (dum_hcd->active != dum_hcd->old_active && dum->ints_enabled) { ++dum->callback_usage; spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); spin_lock(&dum->lock); --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; dum_hcd->old_active = dum_hcd->active; } /*-------------------------------------------------------------------------*/ /* DEVICE/GADGET SIDE DRIVER * * This only tracks gadget state. All the work is done when the host * side tries some (emulated) i/o operation. Real device controller * drivers would do real i/o using dma, fifos, irqs, timers, etc. */ #define is_enabled(dum) \ (dum->port_status & USB_PORT_STAT_ENABLE) static int dummy_enable(struct usb_ep *_ep, const struct usb_endpoint_descriptor *desc) { struct dummy *dum; struct dummy_hcd *dum_hcd; struct dummy_ep *ep; unsigned max; int retval; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !desc || ep->desc || _ep->name == ep0name || desc->bDescriptorType != USB_DT_ENDPOINT) return -EINVAL; dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!is_enabled(dum_hcd)) return -ESHUTDOWN; /* * For HS/FS devices only bits 0..10 of the wMaxPacketSize represent the * maximum packet size. * For SS devices the wMaxPacketSize is limited by 1024. */ max = usb_endpoint_maxp(desc); /* drivers must not request bad settings, since lower levels * (hardware or its drivers) may not check. some endpoints * can't do iso, many have maxpacket limitations, etc. * * since this "hardware" driver is here to help debugging, we * have some extra sanity checks. (there could be more though, * especially for "ep9out" style fixed function ones.) */ retval = -EINVAL; switch (usb_endpoint_type(desc)) { case USB_ENDPOINT_XFER_BULK: if (strstr(ep->ep.name, "-iso") || strstr(ep->ep.name, "-int")) { goto done; } switch (dum->gadget.speed) { case USB_SPEED_SUPER: if (max == 1024) break; goto done; case USB_SPEED_HIGH: if (max == 512) break; goto done; case USB_SPEED_FULL: if (max == 8 || max == 16 || max == 32 || max == 64) /* we'll fake any legal size */ break; /* save a return statement */ fallthrough; default: goto done; } break; case USB_ENDPOINT_XFER_INT: if (strstr(ep->ep.name, "-iso")) /* bulk is ok */ goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 64) break; /* save a return statement */ fallthrough; default: if (max <= 8) break; goto done; } break; case USB_ENDPOINT_XFER_ISOC: if (strstr(ep->ep.name, "-bulk") || strstr(ep->ep.name, "-int")) goto done; /* real hardware might not handle all packet sizes */ switch (dum->gadget.speed) { case USB_SPEED_SUPER: case USB_SPEED_HIGH: if (max <= 1024) break; /* save a return statement */ fallthrough; case USB_SPEED_FULL: if (max <= 1023) break; /* save a return statement */ fallthrough; default: goto done; } break; default: /* few chips support control except on ep0 */ goto done; } _ep->maxpacket = max; if (usb_ss_max_streams(_ep->comp_desc)) { if (!usb_endpoint_xfer_bulk(desc)) { dev_err(udc_dev(dum), "Can't enable stream support on " "non-bulk ep %s\n", _ep->name); return -EINVAL; } ep->stream_en = 1; } ep->desc = desc; dev_dbg(udc_dev(dum), "enabled %s (ep%d%s-%s) maxpacket %d stream %s\n", _ep->name, desc->bEndpointAddress & 0x0f, (desc->bEndpointAddress & USB_DIR_IN) ? "in" : "out", usb_ep_type_string(usb_endpoint_type(desc)), max, ep->stream_en ? "enabled" : "disabled"); /* at this point real hardware should be NAKing transfers * to that endpoint, until a buffer is queued to it. */ ep->halted = ep->wedged = 0; retval = 0; done: return retval; } static int dummy_disable(struct usb_ep *_ep) { struct dummy_ep *ep; struct dummy *dum; unsigned long flags; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || !ep->desc || _ep->name == ep0name) return -EINVAL; dum = ep_to_dummy(ep); spin_lock_irqsave(&dum->lock, flags); ep->desc = NULL; ep->stream_en = 0; nuke(dum, ep); spin_unlock_irqrestore(&dum->lock, flags); dev_dbg(udc_dev(dum), "disabled %s\n", _ep->name); return 0; } static struct usb_request *dummy_alloc_request(struct usb_ep *_ep, gfp_t mem_flags) { struct dummy_request *req; if (!_ep) return NULL; req = kzalloc(sizeof(*req), mem_flags); if (!req) return NULL; INIT_LIST_HEAD(&req->queue); return &req->req; } static void dummy_free_request(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_request *req; if (!_ep || !_req) { WARN_ON(1); return; } req = usb_request_to_dummy_request(_req); WARN_ON(!list_empty(&req->queue)); kfree(req); } static void fifo_complete(struct usb_ep *ep, struct usb_request *req) { } static int dummy_queue(struct usb_ep *_ep, struct usb_request *_req, gfp_t mem_flags) { struct dummy_ep *ep; struct dummy_request *req; struct dummy *dum; struct dummy_hcd *dum_hcd; unsigned long flags; req = usb_request_to_dummy_request(_req); if (!_req || !list_empty(&req->queue) || !_req->complete) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); if (!_ep || (!ep->desc && _ep->name != ep0name)) return -EINVAL; dum = ep_to_dummy(ep); dum_hcd = gadget_to_dummy_hcd(&dum->gadget); if (!dum->driver || !is_enabled(dum_hcd)) return -ESHUTDOWN; #if 0 dev_dbg(udc_dev(dum), "ep %p queue req %p to %s, len %d buf %p\n", ep, _req, _ep->name, _req->length, _req->buf); #endif _req->status = -EINPROGRESS; _req->actual = 0; spin_lock_irqsave(&dum->lock, flags); /* implement an emulated single-request FIFO */ if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && list_empty(&dum->fifo_req.queue) && list_empty(&ep->queue) && _req->length <= FIFO_SIZE) { req = &dum->fifo_req; req->req = *_req; req->req.buf = dum->fifo_buf; memcpy(dum->fifo_buf, _req->buf, _req->length); req->req.context = dum; req->req.complete = fifo_complete; list_add_tail(&req->queue, &ep->queue); spin_unlock(&dum->lock); _req->actual = _req->length; _req->status = 0; usb_gadget_giveback_request(_ep, _req); spin_lock(&dum->lock); } else list_add_tail(&req->queue, &ep->queue); spin_unlock_irqrestore(&dum->lock, flags); /* real hardware would likely enable transfers here, in case * it'd been left NAKing. */ return 0; } static int dummy_dequeue(struct usb_ep *_ep, struct usb_request *_req) { struct dummy_ep *ep; struct dummy *dum; int retval = -EINVAL; unsigned long flags; struct dummy_request *req = NULL, *iter; if (!_ep || !_req) return retval; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; local_irq_save(flags); spin_lock(&dum->lock); list_for_each_entry(iter, &ep->queue, queue) { if (&iter->req != _req) continue; list_del_init(&iter->queue); _req->status = -ECONNRESET; req = iter; retval = 0; break; } spin_unlock(&dum->lock); if (retval == 0) { dev_dbg(udc_dev(dum), "dequeued req %p from %s, len %d buf %p\n", req, _ep->name, _req->length, _req->buf); usb_gadget_giveback_request(_ep, _req); } local_irq_restore(flags); return retval; } static int dummy_set_halt_and_wedge(struct usb_ep *_ep, int value, int wedged) { struct dummy_ep *ep; struct dummy *dum; if (!_ep) return -EINVAL; ep = usb_ep_to_dummy_ep(_ep); dum = ep_to_dummy(ep); if (!dum->driver) return -ESHUTDOWN; if (!value) ep->halted = ep->wedged = 0; else if (ep->desc && (ep->desc->bEndpointAddress & USB_DIR_IN) && !list_empty(&ep->queue)) return -EAGAIN; else { ep->halted = 1; if (wedged) ep->wedged = 1; } /* FIXME clear emulated data toggle too */ return 0; } static int dummy_set_halt(struct usb_ep *_ep, int value) { return dummy_set_halt_and_wedge(_ep, value, 0); } static int dummy_set_wedge(struct usb_ep *_ep) { if (!_ep || _ep->name == ep0name) return -EINVAL; return dummy_set_halt_and_wedge(_ep, 1, 1); } static const struct usb_ep_ops dummy_ep_ops = { .enable = dummy_enable, .disable = dummy_disable, .alloc_request = dummy_alloc_request, .free_request = dummy_free_request, .queue = dummy_queue, .dequeue = dummy_dequeue, .set_halt = dummy_set_halt, .set_wedge = dummy_set_wedge, }; /*-------------------------------------------------------------------------*/ /* there are both host and device side versions of this call ... */ static int dummy_g_get_frame(struct usb_gadget *_gadget) { struct timespec64 ts64; ktime_get_ts64(&ts64); return ts64.tv_nsec / NSEC_PER_MSEC; } static int dummy_wakeup(struct usb_gadget *_gadget) { struct dummy_hcd *dum_hcd; dum_hcd = gadget_to_dummy_hcd(_gadget); if (!(dum_hcd->dum->devstatus & ((1 << USB_DEVICE_B_HNP_ENABLE) | (1 << USB_DEVICE_REMOTE_WAKEUP)))) return -EINVAL; if ((dum_hcd->port_status & USB_PORT_STAT_CONNECTION) == 0) return -ENOLINK; if ((dum_hcd->port_status & USB_PORT_STAT_SUSPEND) == 0 && dum_hcd->rh_state != DUMMY_RH_SUSPENDED) return -EIO; /* FIXME: What if the root hub is suspended but the port isn't? */ /* hub notices our request, issues downstream resume, etc */ dum_hcd->resuming = 1; dum_hcd->re_timeout = jiffies + msecs_to_jiffies(20); mod_timer(&dummy_hcd_to_hcd(dum_hcd)->rh_timer, dum_hcd->re_timeout); return 0; } static int dummy_set_selfpower